From cee9c5eeb1a5d95339affd384c82e582c0fedbc5 Mon Sep 17 00:00:00 2001 From: Noel Chalmers Date: Tue, 7 Jun 2022 19:21:50 -0500 Subject: [PATCH] Update to v0.5.0 (#78) * Update OCCA * Remove gslib * Upgrade to newest ogs * Add new comm_t object * Add new memory objects * Add new timers * Move matrix routines into linAlg * Rename file * [Core] Update core lib * [LinAlg] Update linAlg lib * [LinAlg] Rename some source files * [Core] Remove ambiguous scan overload * [OCCA] Update to latest OCCA * [Mesh] Move some mesh files * [ParAdogs] Add parAdogs mesh partitioner * [Mesh] Update mesh library * [Core] Updates to core library * [LinAlg] Update linear algebra library * [TimeStepper] Update timeStepper library * [LinearSolver] Update linearSolver library * [ParAlmond] Update parAlmond library * [Make] Makefile updates * [Advection] Update advection solver * [Acoustics] Update acoustics solver * [Gradient] Update gradient solver * [Elliptic] Update elliptic solver * [CNS] Update compressible navier stokes solver * [Test] Some test tweaks * [LBS] Update DG Lattice Boltzmann solver * [LBS] Makefile tweak * [BNS] Update Galerkin Boltzmann Navier-Stokes solver * [FPE] Update Fokker-Planck solver * [INS] Update incompressible Navier-Stokes solver * [LinAlg] LinAlg fixes * [InitialGuess] Fixes in initial guess strategies * [Mesh] Initialize mapB to -1 * [Mesh] mapB in mesh_t class * [Mesh] Bug fix in cubature setup * [Mesh] Bugfix in multirate setup * [ParAlmond] Switch to shared_ptr to address mem leak * [Test] Tweak one golden norm * [Test] Add ParAdogs tests * [Solvers] Switch to strings for filenames and kernelname to avoid memory leaks * Version bump * Happy New Year * Need some manual barriers * [Solvers] Trigger ogs kernel builds during setups * [Git] Ignore rc files made by tests * [Make] Fix default openblas-serial lib path * [Git] Use openblas-serial in git workflow * [Comm] Move some things out of the comm_t class, and into a namespace * [Mesh] Make the element 
type an enum * [Make] Missing code coverage flags * [OGS] Add some more explicit instantiations to work around an issue in older gcc * [Timer] Can't use the platform's comm at the end of ogsSetup. Adding a new timer with comm argument * Update README.md * Add code diagram * Update README.md * [LinearSolver] Bugfix for uninitialized memory * [Libs][Solvers] Small fixes for building in FP32 mode * [Core] Remove repeated compiler flags * [Core] Workaround for occa dtypes being empty for user-types * [ParAdogs] Reduce a termination tolerance when in FP32 mode * [ParAdogs] Typo * [OCCA] Switch to OCCA dev branch * [Core] Add a check to not exceed the max thread count OpenMP reports * [OGS] Add some restricts to pointers for host operators * [OCCA] Fix properties syntax * [OGS] Add special code paths for scalar ogs ops * Add some notes on CPU binding to the README * [Acoustics] Hide more comm time with the surface kernel * [OGS] Fix final row block entry being too large * [OCCA] Update to OCCA v1.3 --- .github/CodeDiagram.png | Bin 0 -> 43595 bytes .github/workflows/build.yml | 4 +- .gitignore | 3 +- 3rdParty/gslib/.travis.yml | 36 - 3rdParty/gslib/LICENSE | 58 - 3rdParty/gslib/Makefile | 187 -- 3rdParty/gslib/README.md | 23 - 3rdParty/gslib/RELEASE.md | 17 - 3rdParty/gslib/cdep.py | 33 - 3rdParty/gslib/makefile.cdep | 42 - 3rdParty/gslib/odep_info.py | 50 - 3rdParty/gslib/src/c99.h | 16 - 3rdParty/gslib/src/comm.c | 210 -- 3rdParty/gslib/src/comm.h | 259 --- 3rdParty/gslib/src/crs.h | 36 - 3rdParty/gslib/src/crystal.c | 141 -- 3rdParty/gslib/src/crystal.h | 21 - 3rdParty/gslib/src/fail.c | 63 - 3rdParty/gslib/src/fail.h | 52 - 3rdParty/gslib/src/fcrystal.c | 191 -- 3rdParty/gslib/src/findpts.c | 369 ---- 3rdParty/gslib/src/findpts.h | 73 - 3rdParty/gslib/src/findpts_el.h | 122 -- 3rdParty/gslib/src/findpts_el_2.c | 819 ------- 3rdParty/gslib/src/findpts_el_3.c | 1318 ----------- 3rdParty/gslib/src/findpts_imp.h | 470 ---- 3rdParty/gslib/src/findpts_local.c | 52 - 
3rdParty/gslib/src/findpts_local.h | 96 - 3rdParty/gslib/src/findpts_local_imp.h | 388 ---- 3rdParty/gslib/src/gen_poly_imp.c | 226 -- 3rdParty/gslib/src/gs.c | 1651 -------------- 3rdParty/gslib/src/gs.h | 155 -- 3rdParty/gslib/src/gs_defs.h | 81 - 3rdParty/gslib/src/gs_local.c | 336 --- 3rdParty/gslib/src/gs_local.h | 43 - 3rdParty/gslib/src/gslib.h | 20 - 3rdParty/gslib/src/lob_bnd.c | 285 --- 3rdParty/gslib/src/lob_bnd.h | 111 - 3rdParty/gslib/src/mem.h | 168 -- 3rdParty/gslib/src/name.h | 44 - 3rdParty/gslib/src/obbox.c | 341 --- 3rdParty/gslib/src/obbox.h | 113 - 3rdParty/gslib/src/poly.c | 236 -- 3rdParty/gslib/src/poly.h | 65 - 3rdParty/gslib/src/poly_imp.h | 1949 ----------------- 3rdParty/gslib/src/rand_elt_test.c | 169 -- 3rdParty/gslib/src/rand_elt_test.h | 18 - 3rdParty/gslib/src/sarray_sort.c | 45 - 3rdParty/gslib/src/sarray_sort.h | 89 - 3rdParty/gslib/src/sarray_transfer.c | 198 -- 3rdParty/gslib/src/sarray_transfer.h | 95 - 3rdParty/gslib/src/sort.c | 31 - 3rdParty/gslib/src/sort.h | 76 - 3rdParty/gslib/src/sort_imp.h | 544 ----- 3rdParty/gslib/src/tensor.c | 82 - 3rdParty/gslib/src/tensor.h | 199 -- 3rdParty/gslib/src/types.h | 85 - 3rdParty/gslib/tests/comm_test.c | 37 - 3rdParty/gslib/tests/crystal_test.c | 88 - 3rdParty/gslib/tests/findpts_el_2_test.c | 73 - 3rdParty/gslib/tests/findpts_el_2_test2.c | 97 - 3rdParty/gslib/tests/findpts_el_3_test.c | 77 - 3rdParty/gslib/tests/findpts_el_3_test2.c | 107 - 3rdParty/gslib/tests/findpts_local_test.c | 210 -- 3rdParty/gslib/tests/findpts_test.c | 328 --- 3rdParty/gslib/tests/fortran/f-igs.f | 59 - 3rdParty/gslib/tests/gs_test.c | 133 -- 3rdParty/gslib/tests/gs_test_gop_blocking.c | 107 - .../gslib/tests/gs_test_gop_nonblocking.c | 131 -- 3rdParty/gslib/tests/gs_test_old.c | 148 -- 3rdParty/gslib/tests/gs_unique_test.c | 87 - 3rdParty/gslib/tests/lob_bnd_test.c | 185 -- 3rdParty/gslib/tests/obbox_test.c | 207 -- 3rdParty/gslib/tests/poly_test.c | 23 - 3rdParty/gslib/tests/run_tests.sh | 21 - 
3rdParty/gslib/tests/sarray_sort_test.c | 47 - 3rdParty/gslib/tests/sarray_transfer_test.c | 93 - 3rdParty/gslib/tests/sort_test.c | 113 - 3rdParty/gslib/tests/sort_test2.c | 58 - LICENSE | 2 +- README.md | 87 +- include/comm.hpp | 565 +++++ include/core.hpp | 58 +- include/initialGuess.hpp | 148 +- include/linAlg.hpp | 189 +- include/linearSolver.hpp | 188 +- include/memory.hpp | 778 +++++++ include/mesh.hpp | 1338 +++++++---- include/mesh/mesh2D.hpp | 94 - include/mesh/mesh3D.hpp | 141 -- include/mesh/meshDefines3D.h | 99 - include/ogs.hpp | 551 +++-- include/ogs/ogsBase.hpp | 112 + include/ogs/ogsDefs.h | 65 - include/ogs/ogsExchange.hpp | 336 +++ include/ogs/ogsKernels.hpp | 203 -- include/ogs/ogsOperator.hpp | 147 ++ include/ogs/ogsUtils.hpp | 87 + .../operator.hpp | 30 +- .../parAdogs.hpp | 76 +- include/parAdogs/parAdogsGraph.hpp | 159 ++ include/parAdogs/parAdogsMatrix.hpp | 128 ++ include/parAdogs/parAdogsMultigrid.hpp | 121 + .../parAdogsPartition.hpp} | 44 +- include/parAlmond.hpp | 189 +- include/parAlmond/parAlmondAMGLevel.hpp | 27 +- include/parAlmond/parAlmondAMGSetup.hpp | 48 +- include/parAlmond/parAlmondCoarseSolver.hpp | 71 +- include/parAlmond/parAlmondDefines.hpp | 18 +- include/parAlmond/parAlmondKernels.hpp | 47 +- include/parAlmond/parAlmondMultigrid.hpp | 123 -- include/parAlmond/parAlmondparCSR.hpp | 103 +- include/platform.hpp | 198 +- include/precon.hpp | 38 +- include/settings.hpp | 53 +- include/solver.hpp | 75 +- include/timeStepper.hpp | 520 ++--- include/timer.hpp | 59 + include/types.h | 16 +- include/utils.hpp | 109 +- libs/core/comm.cpp | 118 + libs/core/exception.cpp | 90 + libs/core/factor.cpp | 89 - libs/core/matrixEig.cpp | 178 -- libs/core/matrixRightSolve.cpp | 301 --- .../memory.cpp} | 35 +- libs/core/parallelSort.cpp | 145 -- libs/core/platformBuildKernel.cpp | 23 +- libs/core/platformDeviceConfig.cpp | 159 +- libs/core/platformProperties.cpp | 98 +- libs/core/platformSettings.cpp | 19 +- libs/core/rankDecomp.cpp | 215 ++ 
libs/core/settings.cpp | 101 +- .../core/timer.cpp | 70 +- libs/linAlg/linAlg.cpp | 175 +- .../linAlgMatrixConditionNumber.cpp} | 92 +- libs/linAlg/linAlgMatrixEig.cpp | 152 ++ .../linAlgMatrixInverse.cpp} | 64 +- libs/linAlg/linAlgMatrixRightSolve.cpp | 381 ++++ .../linAlgMatrixTranspose.cpp} | 46 +- libs/linAlg/linAlgSetup.cpp | 122 +- libs/linAlg/okl/linAlgADXPY.okl | 4 +- libs/linAlg/okl/linAlgAMXPY.okl | 2 +- libs/linAlg/okl/linAlgAXPY.okl | 4 +- libs/linAlg/okl/linAlgAdd.okl | 2 +- libs/linAlg/okl/linAlgInnerProd.okl | 69 +- libs/linAlg/okl/linAlgMax.okl | 72 +- libs/linAlg/okl/linAlgMin.okl | 72 +- libs/linAlg/okl/linAlgNorm2.okl | 96 +- libs/linAlg/okl/linAlgScale.okl | 2 +- libs/linAlg/okl/linAlgSet.okl | 2 +- libs/linAlg/okl/linAlgSum.okl | 71 +- libs/linAlg/okl/linAlgWeightedInnerProd.okl | 132 +- libs/linAlg/okl/linAlgWeightedNorm2.okl | 2 +- libs/linearSolver/initialGuess.cpp | 351 +-- libs/linearSolver/linearSolver.cpp | 52 +- libs/linearSolver/linearSolverNBFPCG.cpp | 152 +- libs/linearSolver/linearSolverNBPCG.cpp | 128 +- libs/linearSolver/linearSolverPCG.cpp | 88 +- libs/linearSolver/linearSolverPGMRES.cpp | 71 +- libs/linearSolver/linearSolverPMINRES.cpp | 71 +- .../linearSolver/okl/igBasisInnerProducts.okl | 11 +- libs/linearSolver/okl/igDropQRFirstColumn.okl | 6 +- libs/linearSolver/okl/igExtrap.okl | 2 +- libs/linearSolver/okl/igReconstruct.okl | 3 +- libs/linearSolver/okl/igScale.okl | 2 +- libs/linearSolver/okl/igUpdate.okl | 2 +- .../okl/linearSolverUpdateMINRES.okl | 2 +- .../okl/linearSolverUpdateNBFPCG.okl | 26 +- .../okl/linearSolverUpdateNBPCG.okl | 26 +- .../okl/linearSolverUpdatePCG.okl | 19 +- libs/makefile | 101 +- libs/mesh/mesh.cpp | 67 - libs/mesh/meshBasis1D.cpp | 233 +- libs/mesh/meshBasisHex3D.cpp | 393 +++- libs/mesh/meshBasisQuad2D.cpp | 315 ++- libs/mesh/meshBasisTet3D.cpp | 688 ++++-- libs/mesh/meshBasisTri2D.cpp | 685 +++--- libs/mesh/meshConnect.cpp | 273 ++- libs/mesh/meshConnectBoundary.cpp | 39 +- 
libs/mesh/meshConnectFaceNodes.cpp | 103 + libs/mesh/meshConnectFaceNodes2D.cpp | 138 -- libs/mesh/meshConnectFaceNodes3D.cpp | 145 -- libs/mesh/meshConnectFaceVertices.cpp | 66 + libs/mesh/meshConnectNodes.cpp | 119 + libs/mesh/meshCubatureNodesHex3D.cpp | 67 +- libs/mesh/meshCubatureNodesQuad2D.cpp | 42 +- libs/mesh/meshCubatureNodesQuad3D.cpp | 54 +- libs/mesh/meshCubatureNodesTet3D.cpp | 42 +- libs/mesh/meshCubatureNodesTri2D.cpp | 27 +- libs/mesh/meshCubatureNodesTri3D.cpp | 33 +- libs/mesh/meshCubatureSetupHex3D.cpp | 169 +- libs/mesh/meshCubatureSetupQuad2D.cpp | 95 +- libs/mesh/meshCubatureSetupTet3D.cpp | 84 +- libs/mesh/meshCubatureSetupTri2D.cpp | 83 +- ...erSetup.cpp => meshGatherScatterSetup.cpp} | 78 +- libs/mesh/meshGeometricFactorsHex3D.cpp | 87 +- libs/mesh/meshGeometricFactorsQuad2D.cpp | 67 +- libs/mesh/meshGeometricFactorsQuad3D.cpp | 475 ++-- libs/mesh/meshGeometricFactorsTet3D.cpp | 89 +- libs/mesh/meshGeometricFactorsTri2D.cpp | 60 +- libs/mesh/meshGeometricFactorsTri3D.cpp | 91 +- libs/mesh/meshGeometricPartition2D.cpp | 396 ---- libs/mesh/meshGeometricPartition3D.cpp | 354 --- libs/mesh/meshHaloRingSetup.cpp | 190 +- libs/mesh/meshHaloSetup.cpp | 46 +- libs/mesh/meshHaloTraceSetup.cpp | 36 +- libs/mesh/meshMassMatrixApply.cpp | 39 +- libs/mesh/meshMinCharacteristicLength.cpp | 73 +- libs/mesh/meshMultiRateHaloTraceSetup.cpp | 55 +- libs/mesh/meshMultiRateSetup.cpp | 84 +- libs/mesh/meshOccaSetup.cpp | 62 - libs/mesh/meshOccaSetup2D.cpp | 60 - libs/mesh/meshOccaSetup3D.cpp | 77 - libs/mesh/meshOccaSetupHex3D.cpp | 88 - libs/mesh/meshOccaSetupTet3D.cpp | 80 - libs/mesh/meshOccaSetupTri2D.cpp | 72 - libs/mesh/meshOccaSetupTri3D.cpp | 72 - libs/mesh/meshParallelConnectNodes.cpp | 110 - libs/mesh/meshParallelConnectOpt.cpp | 233 -- libs/mesh/meshParallelReaderQuad3D.cpp | 227 -- libs/mesh/meshParallelReaderTri3D.cpp | 223 -- libs/mesh/meshPartition.cpp | 51 + libs/mesh/meshPhysicalNodesHex3D.cpp | 31 +- libs/mesh/meshPhysicalNodesQuad2D.cpp | 26 
+- libs/mesh/meshPhysicalNodesQuad3D.cpp | 31 +- libs/mesh/meshPhysicalNodesTet3D.cpp | 30 +- libs/mesh/meshPhysicalNodesTri2D.cpp | 25 +- libs/mesh/meshPhysicalNodesTri3D.cpp | 31 +- libs/mesh/meshPlotInterpHex3D.cpp | 33 +- libs/mesh/meshPlotInterpQuad2D.cpp | 27 +- libs/mesh/meshPlotInterpQuad3D.cpp | 78 - libs/mesh/meshPlotInterpTet3D.cpp | 9 +- libs/mesh/meshPlotInterpTri2D.cpp | 9 +- libs/mesh/meshPmlSetup.cpp | 55 +- ...lReaderHex3D.cpp => meshReadGmshHex3D.cpp} | 123 +- ...eaderQuad2D.cpp => meshReadGmshQuad2D.cpp} | 118 +- libs/mesh/meshReadGmshQuad3D.cpp | 189 ++ ...lReaderTet3D.cpp => meshReadGmshTet3D.cpp} | 121 +- ...lReaderTri2D.cpp => meshReadGmshTri2D.cpp} | 118 +- libs/mesh/meshReadGmshTri3D.cpp | 184 ++ libs/mesh/meshReferenceNodesHex3D.cpp | 49 +- libs/mesh/meshReferenceNodesQuad2D.cpp | 54 +- libs/mesh/meshReferenceNodesTet3D.cpp | 79 +- libs/mesh/meshReferenceNodesTri2D.cpp | 74 +- libs/mesh/meshSetElementType.cpp | 90 + libs/mesh/meshSettings.cpp | 20 +- libs/mesh/meshSetup.cpp | 110 +- libs/mesh/meshSetupBoxHex3D.cpp | 67 +- libs/mesh/meshSetupBoxQuad2D.cpp | 60 +- libs/mesh/meshSetupBoxTet3D.cpp | 55 +- libs/mesh/meshSetupBoxTri2D.cpp | 55 +- libs/mesh/meshSetupNewDegree.cpp | 114 +- libs/mesh/meshSetupPmlBoxHex3D.cpp | 76 +- libs/mesh/meshSetupPmlBoxQuad2D.cpp | 93 +- libs/mesh/meshSetupPmlBoxTet3D.cpp | 67 +- libs/mesh/meshSetupPmlBoxTri2D.cpp | 57 +- libs/mesh/meshSetupRingPatch.cpp | 204 +- libs/mesh/meshSetupSEMFEM.cpp | 259 +-- .../mesh/meshSurfaceGeometricFactorsHex3D.cpp | 138 +- .../meshSurfaceGeometricFactorsQuad2D.cpp | 79 +- .../meshSurfaceGeometricFactorsQuad3D.cpp | 358 +-- .../mesh/meshSurfaceGeometricFactorsTet3D.cpp | 121 +- .../mesh/meshSurfaceGeometricFactorsTri2D.cpp | 99 +- .../mesh/meshSurfaceGeometricFactorsTri3D.cpp | 174 +- libs/mesh/okl/MassMatrixOperatorHex3D.okl | 9 +- libs/mesh/okl/MassMatrixOperatorQuad2D.okl | 9 +- libs/mesh/okl/MassMatrixOperatorTet3D.okl | 9 +- libs/mesh/okl/MassMatrixOperatorTri2D.okl | 9 +- 
libs/ogs/gs.cpp | 132 -- libs/ogs/hostGather.cpp | 164 -- libs/ogs/hostGatherScatter.cpp | 180 -- libs/ogs/hostScatter.cpp | 155 -- libs/ogs/occaGather.cpp | 170 -- libs/ogs/occaGatherScatter.cpp | 176 -- libs/ogs/occaGatheredHaloExchange.cpp | 123 -- libs/ogs/occaScatter.cpp | 165 -- libs/ogs/ogs.cpp | 638 ++++-- libs/ogs/ogsAllToAll.cpp | 358 +++ libs/ogs/ogsAuto.cpp | 349 +++ libs/ogs/ogsCrystalRouter.cpp | 775 +++++++ libs/ogs/ogsHalo.cpp | 395 ++++ libs/ogs/ogsKernels.cpp | 132 -- libs/ogs/ogsOperator.cpp | 635 ++++++ libs/ogs/ogsPairwise.cpp | 430 ++++ libs/ogs/ogsSetup.cpp | 1184 ++++++---- libs/ogs/ogsUtils.cpp | 127 ++ libs/ogs/okl/gatherScatter.okl | 185 -- libs/ogs/okl/ogsKernels.okl | 177 ++ libs/parAdogs/parAdogsConnect.cpp | 242 ++ libs/parAdogs/parAdogsCuthillMckee.cpp | 153 ++ libs/parAdogs/parAdogsFiedlerVector.cpp | 176 ++ libs/parAdogs/parAdogsGraph.cpp | 426 ++++ libs/parAdogs/parAdogsInertialBipartition.cpp | 201 ++ libs/parAdogs/parAdogsInertialPartition.cpp | 60 + libs/parAdogs/parAdogsMatrix.cpp | 416 ++++ libs/parAdogs/parAdogsMeshPartition.cpp | 114 + libs/parAdogs/parAdogsMultigrid.cpp | 94 + libs/parAdogs/parAdogsMultigridAggregate.cpp | 315 +++ .../parAdogsMultigridCoarseSolver.cpp | 148 ++ libs/parAdogs/parAdogsMultigridLaplacian.cpp | 166 ++ libs/parAdogs/parAdogsMultigridSetup.cpp | 190 ++ libs/parAdogs/parAdogsMultigridSmooth.cpp | 177 ++ .../parAdogsMultigridSmoothPrologator.cpp | 332 +++ libs/parAdogs/parAdogsMultigridSpMM.cpp | 294 +++ .../parAdogsMultigridTentativeProlongator.cpp | 101 + libs/parAdogs/parAdogsMultigridTranspose.cpp | 177 ++ libs/parAdogs/parAdogsParallelPivot.cpp | 105 + libs/parAdogs/parAdogsRefine.cpp | 141 ++ .../parAdogsSettings.cpp} | 30 +- libs/parAdogs/parAdogsSolve.cpp | 131 ++ libs/parAdogs/parAdogsSpectralBipartition.cpp | 73 + libs/parAdogs/parAdogsSpectralPartition.cpp | 61 + libs/parAlmond/okl/SmoothChebyshev.okl | 4 +- libs/parAlmond/okl/SmoothJacobi.okl | 4 +- libs/parAlmond/okl/SpMVcsr.okl | 2 +- 
libs/parAlmond/okl/SpMVmcsr.okl | 2 +- libs/parAlmond/okl/dGEMV.okl | 2 +- libs/parAlmond/okl/kcycleCombinedOp.okl | 44 +- libs/parAlmond/okl/vectorAddInnerProd.okl | 24 +- libs/parAlmond/parAlmond.cpp | 52 +- libs/parAlmond/parAlmondAMGLevel.cpp | 110 +- libs/parAlmond/parAlmondAMGSetup.cpp | 116 +- libs/parAlmond/parAlmondAMGSmoother.cpp | 82 +- libs/parAlmond/parAlmondCoarseExact.cpp | 190 +- libs/parAlmond/parAlmondCoarseOAS.cpp | 221 +- libs/parAlmond/parAlmondCoarsenLevel.cpp | 55 +- libs/parAlmond/parAlmondFormAggregates.cpp | 106 +- libs/parAlmond/parAlmondGalerkinProd.cpp | 116 +- libs/parAlmond/parAlmondKcycle.cpp | 198 +- libs/parAlmond/parAlmondKernels.cpp | 115 +- libs/parAlmond/parAlmondMultigrid.cpp | 81 +- libs/parAlmond/parAlmondSettings.cpp | 10 +- libs/parAlmond/parAlmondSmoothPrologator.cpp | 204 +- libs/parAlmond/parAlmondSpMM.cpp | 174 +- libs/parAlmond/parAlmondStrongGraph.cpp | 164 +- .../parAlmondTentativeProlongator.cpp | 61 +- libs/parAlmond/parAlmondTranspose.cpp | 88 +- libs/parAlmond/parAlmondVcycle.cpp | 28 +- libs/parAlmond/parAlmondparCSR.cpp | 231 +- libs/timeStepper/okl/timeStepperAB.okl | 2 +- libs/timeStepper/okl/timeStepperDOPRI5.okl | 11 +- libs/timeStepper/okl/timeStepperEXTBDF.okl | 2 +- libs/timeStepper/okl/timeStepperLSERK4.okl | 2 +- libs/timeStepper/okl/timeStepperMRAB.okl | 10 +- libs/timeStepper/okl/timeStepperMRSAAB.okl | 10 +- libs/timeStepper/okl/timeStepperSAAB.okl | 4 +- libs/timeStepper/okl/timeStepperSARK.okl | 10 +- libs/timeStepper/okl/timeStepperSSBDF.okl | 2 +- libs/timeStepper/timeStepper.cpp | 59 + libs/timeStepper/timeStepperAB3.cpp | 80 +- libs/timeStepper/timeStepperDOPRI5.cpp | 180 +- libs/timeStepper/timeStepperEXTBDF3.cpp | 77 +- libs/timeStepper/timeStepperLSERK4.cpp | 89 +- libs/timeStepper/timeStepperMRAB3.cpp | 139 +- libs/timeStepper/timeStepperMRSAAB3.cpp | 184 +- libs/timeStepper/timeStepperSAAB3.cpp | 118 +- libs/timeStepper/timeStepperSARK4.cpp | 233 +- libs/timeStepper/timeStepperSARK5.cpp 
| 234 +- libs/timeStepper/timeStepperSSBDF3.cpp | 63 +- make.top | 66 +- makefile | 2 +- occa | 2 +- solvers/acoustics/acoustics.hpp | 45 +- solvers/acoustics/acousticsMain.cpp | 49 +- solvers/acoustics/data/acousticsGaussian2D.h | 2 +- solvers/acoustics/data/acousticsGaussian3D.h | 4 +- solvers/acoustics/makefile | 19 +- .../okl/acousticsInitialCondition2D.okl | 2 +- .../okl/acousticsInitialCondition3D.okl | 2 +- .../acoustics/okl/acousticsSurfaceHex3D.okl | 46 +- .../acoustics/okl/acousticsSurfaceQuad2D.okl | 41 +- .../acoustics/okl/acousticsSurfaceTet3D.okl | 33 +- .../acoustics/okl/acousticsSurfaceTri2D.okl | 29 +- .../acoustics/okl/acousticsVolumeHex3D.okl | 4 +- .../acoustics/okl/acousticsVolumeQuad2D.okl | 4 +- .../acoustics/okl/acousticsVolumeTet3D.okl | 10 +- .../acoustics/okl/acousticsVolumeTri2D.okl | 4 +- solvers/acoustics/src/acousticsPlotFields.cpp | 46 +- solvers/acoustics/src/acousticsReport.cpp | 8 +- solvers/acoustics/src/acousticsRun.cpp | 8 +- solvers/acoustics/src/acousticsSettings.cpp | 23 +- solvers/acoustics/src/acousticsSetup.cpp | 128 +- solvers/acoustics/src/acousticsStep.cpp | 51 +- solvers/advection/advection.hpp | 47 +- solvers/advection/advectionMain.cpp | 47 +- solvers/advection/data/advectionLinear2D.h | 2 +- solvers/advection/data/advectionLinear3D.h | 2 +- solvers/advection/makefile | 17 +- .../okl/advectionInitialCondition2D.okl | 2 +- .../okl/advectionInitialCondition3D.okl | 2 +- .../okl/advectionMaxWaveSpeedHex3D.okl | 6 +- .../okl/advectionMaxWaveSpeedQuad2D.okl | 6 +- .../okl/advectionMaxWaveSpeedTet3D.okl | 4 +- .../okl/advectionMaxWaveSpeedTri2D.okl | 4 +- .../advection/okl/advectionSurfaceHex3D.okl | 7 +- .../advection/okl/advectionSurfaceQuad2D.okl | 8 +- .../advection/okl/advectionSurfaceTet3D.okl | 5 +- .../advection/okl/advectionSurfaceTri2D.okl | 5 +- .../advection/okl/advectionVolumeHex3D.okl | 4 +- .../advection/okl/advectionVolumeQuad2D.okl | 6 +- .../advection/okl/advectionVolumeTet3D.okl | 4 +- 
.../advection/okl/advectionVolumeTri2D.okl | 4 +- solvers/advection/src/advectionPlotFields.cpp | 40 +- solvers/advection/src/advectionReport.cpp | 8 +- solvers/advection/src/advectionRun.cpp | 8 +- solvers/advection/src/advectionSettings.cpp | 23 +- solvers/advection/src/advectionSetup.cpp | 127 +- solvers/advection/src/advectionStep.cpp | 17 +- solvers/bns/bns.hpp | 119 +- solvers/bns/bnsMain.cpp | 49 +- solvers/bns/data/bnsGaussian2D.h | 2 +- solvers/bns/data/bnsGaussian3D.h | 2 +- solvers/bns/data/bnsUniform2D.h | 2 +- solvers/bns/data/bnsUniform3D.h | 2 +- solvers/bns/makefile | 17 +- solvers/bns/okl/bnsConstrainQuad3D.okl | 44 +- solvers/bns/okl/bnsInitialCondition2D.okl | 2 +- solvers/bns/okl/bnsInitialCondition3D.okl | 2 +- solvers/bns/okl/bnsIsoSurface3D.okl | 4 +- solvers/bns/okl/bnsRelaxationHex3D.okl | 35 +- solvers/bns/okl/bnsRelaxationQuad2D.okl | 16 +- solvers/bns/okl/bnsRelaxationQuad3D.okl | 297 ++- solvers/bns/okl/bnsRelaxationTet3D.okl | 7 +- solvers/bns/okl/bnsRelaxationTri2D.okl | 6 +- solvers/bns/okl/bnsSurfaceHex3D.okl | 18 +- solvers/bns/okl/bnsSurfaceQuad2D.okl | 14 +- solvers/bns/okl/bnsSurfaceQuad3D.okl | 205 +- solvers/bns/okl/bnsSurfaceTet3D.okl | 6 +- solvers/bns/okl/bnsSurfaceTri2D.okl | 6 +- solvers/bns/okl/bnsVolumeHex3D.okl | 5 +- solvers/bns/okl/bnsVolumeQuad2D.okl | 5 +- solvers/bns/okl/bnsVolumeQuad3D.okl | 337 ++- solvers/bns/okl/bnsVolumeTet3D.okl | 5 +- solvers/bns/okl/bnsVolumeTri2D.okl | 7 +- solvers/bns/okl/bnsVorticityHex3D.okl | 5 +- solvers/bns/okl/bnsVorticityQuad2D.okl | 5 +- solvers/bns/okl/bnsVorticityQuad3D.okl | 3 +- solvers/bns/okl/bnsVorticityTet3D.okl | 3 +- solvers/bns/okl/bnsVorticityTri2D.okl | 3 +- solvers/bns/src/bnsPlotFields.cpp | 57 +- solvers/bns/src/bnsPmlSetup.cpp | 22 +- solvers/bns/src/bnsReport.cpp | 8 +- solvers/bns/src/bnsRun.cpp | 10 +- solvers/bns/src/bnsSettings.cpp | 23 +- solvers/bns/src/bnsSetup.cpp | 248 +-- solvers/bns/src/bnsStep.cpp | 64 +- solvers/cns/cns.hpp | 71 +- 
solvers/cns/cnsMain.cpp | 49 +- solvers/cns/data/cnsGaussian2D.h | 2 +- solvers/cns/data/cnsGaussian3D.h | 2 +- solvers/cns/data/cnsUniform2D.h | 2 +- solvers/cns/data/cnsUniform3D.h | 2 +- solvers/cns/data/cnsVortexDipole2D.h | 2 +- solvers/cns/makefile | 17 +- solvers/cns/okl/cnsConstrainQuad3D.okl | 44 +- solvers/cns/okl/cnsCubatureSurfaceHex3D.okl | 24 +- solvers/cns/okl/cnsCubatureSurfaceQuad2D.okl | 12 +- solvers/cns/okl/cnsCubatureSurfaceQuad3D.okl | 758 ++++--- solvers/cns/okl/cnsCubatureSurfaceTet3D.okl | 8 +- solvers/cns/okl/cnsCubatureSurfaceTri2D.okl | 4 +- solvers/cns/okl/cnsCubatureVolumeHex3D.okl | 14 +- solvers/cns/okl/cnsCubatureVolumeQuad2D.okl | 9 +- solvers/cns/okl/cnsCubatureVolumeQuad3D.okl | 244 +-- solvers/cns/okl/cnsCubatureVolumeTet3D.okl | 4 +- solvers/cns/okl/cnsCubatureVolumeTri2D.okl | 4 +- solvers/cns/okl/cnsGradSurfaceHex3D.okl | 4 +- solvers/cns/okl/cnsGradSurfaceQuad2D.okl | 5 +- solvers/cns/okl/cnsGradSurfaceTet3D.okl | 3 +- solvers/cns/okl/cnsGradSurfaceTri2D.okl | 3 +- solvers/cns/okl/cnsGradVolumeHex3D.okl | 3 +- solvers/cns/okl/cnsGradVolumeQuad2D.okl | 3 +- solvers/cns/okl/cnsGradVolumeTet3D.okl | 3 +- solvers/cns/okl/cnsGradVolumeTri2D.okl | 3 +- solvers/cns/okl/cnsInitialCondition2D.okl | 2 +- solvers/cns/okl/cnsInitialCondition3D.okl | 2 +- .../okl/cnsIsothermalCubatureSurfaceHex3D.okl | 23 +- .../cnsIsothermalCubatureSurfaceQuad2D.okl | 7 +- .../okl/cnsIsothermalCubatureSurfaceTet3D.okl | 8 +- .../okl/cnsIsothermalCubatureSurfaceTri2D.okl | 4 +- .../okl/cnsIsothermalCubatureVolumeHex3D.okl | 14 +- .../okl/cnsIsothermalCubatureVolumeQuad2D.okl | 9 +- .../okl/cnsIsothermalCubatureVolumeTet3D.okl | 4 +- .../okl/cnsIsothermalCubatureVolumeTri2D.okl | 4 +- solvers/cns/okl/cnsIsothermalSurfaceHex3D.okl | 7 +- .../cns/okl/cnsIsothermalSurfaceQuad2D.okl | 5 +- solvers/cns/okl/cnsIsothermalSurfaceTet3D.okl | 3 +- solvers/cns/okl/cnsIsothermalSurfaceTri2D.okl | 3 +- solvers/cns/okl/cnsIsothermalVolumeHex3D.okl | 3 +- 
solvers/cns/okl/cnsIsothermalVolumeQuad2D.okl | 3 +- solvers/cns/okl/cnsIsothermalVolumeTet3D.okl | 3 +- solvers/cns/okl/cnsIsothermalVolumeTri2D.okl | 3 +- solvers/cns/okl/cnsMaxWaveSpeedHex3D.okl | 2 +- solvers/cns/okl/cnsMaxWaveSpeedQuad2D.okl | 4 +- solvers/cns/okl/cnsMaxWaveSpeedTet3D.okl | 4 +- solvers/cns/okl/cnsMaxWaveSpeedTri2D.okl | 4 +- solvers/cns/okl/cnsSurfaceHex3D.okl | 7 +- solvers/cns/okl/cnsSurfaceQuad2D.okl | 5 +- solvers/cns/okl/cnsSurfaceQuad3D.okl | 370 ++-- solvers/cns/okl/cnsSurfaceTet3D.okl | 3 +- solvers/cns/okl/cnsSurfaceTri2D.okl | 3 +- solvers/cns/okl/cnsVolumeHex3D.okl | 3 +- solvers/cns/okl/cnsVolumeQuad2D.okl | 5 +- solvers/cns/okl/cnsVolumeQuad3D.okl | 194 +- solvers/cns/okl/cnsVolumeTet3D.okl | 3 +- solvers/cns/okl/cnsVolumeTri2D.okl | 3 +- solvers/cns/okl/cnsVorticityHex3D.okl | 137 +- solvers/cns/okl/cnsVorticityQuad2D.okl | 5 +- solvers/cns/okl/cnsVorticityQuad3D.okl | 65 +- solvers/cns/okl/cnsVorticityTet3D.okl | 3 +- solvers/cns/okl/cnsVorticityTri2D.okl | 3 +- solvers/cns/src/cnsPlotFields.cpp | 59 +- solvers/cns/src/cnsReport.cpp | 8 +- solvers/cns/src/cnsRun.cpp | 10 +- solvers/cns/src/cnsSettings.cpp | 23 +- solvers/cns/src/cnsSetup.cpp | 253 +-- solvers/cns/src/cnsStep.cpp | 19 +- solvers/elliptic/data/ellipticBoundary2D.h | 2 +- solvers/elliptic/data/ellipticBoundary3D.h | 2 +- solvers/elliptic/data/ellipticHomogeneous2D.h | 2 +- solvers/elliptic/data/ellipticHomogeneous3D.h | 2 +- solvers/elliptic/data/ellipticSine2D.h | 2 +- solvers/elliptic/data/ellipticSine3D.h | 2 +- solvers/elliptic/elliptic.hpp | 115 +- solvers/elliptic/ellipticMain.cpp | 66 +- solvers/elliptic/ellipticPrecon.hpp | 214 +- solvers/elliptic/makefile | 17 +- solvers/elliptic/okl/ellipticAddBCHex3D.okl | 2 +- solvers/elliptic/okl/ellipticAddBCQuad2D.okl | 4 +- solvers/elliptic/okl/ellipticAddBCQuad3D.okl | 2 +- solvers/elliptic/okl/ellipticAddBCTet3D.okl | 2 +- solvers/elliptic/okl/ellipticAddBCTri2D.okl | 2 +- solvers/elliptic/okl/ellipticAxHex3D.okl 
| 38 +- solvers/elliptic/okl/ellipticAxIpdgHex3D.okl | 20 +- solvers/elliptic/okl/ellipticAxIpdgQuad2D.okl | 10 +- solvers/elliptic/okl/ellipticAxIpdgQuad3D.okl | 10 +- solvers/elliptic/okl/ellipticAxIpdgTet3D.okl | 10 +- solvers/elliptic/okl/ellipticAxIpdgTri2D.okl | 10 +- solvers/elliptic/okl/ellipticAxIpdgTri3D.okl | 568 +++-- solvers/elliptic/okl/ellipticAxQuad2D.okl | 18 +- solvers/elliptic/okl/ellipticAxQuad3D.okl | 68 +- solvers/elliptic/okl/ellipticAxTet3D.okl | 16 +- solvers/elliptic/okl/ellipticAxTri2D.okl | 10 +- solvers/elliptic/okl/ellipticAxTri3D.okl | 10 +- .../elliptic/okl/ellipticCubatureAxHex3D.okl | 406 ++-- .../elliptic/okl/ellipticGradientHex3D.okl | 4 +- .../elliptic/okl/ellipticGradientQuad2D.okl | 4 +- .../elliptic/okl/ellipticGradientQuad3D.okl | 4 +- .../elliptic/okl/ellipticGradientTet3D.okl | 4 +- .../elliptic/okl/ellipticGradientTri2D.okl | 5 +- .../elliptic/okl/ellipticGradientTri3D.okl | 46 +- solvers/elliptic/okl/ellipticMask.okl | 2 +- solvers/elliptic/okl/ellipticPatchSolver.okl | 3 +- .../okl/ellipticPreconBlockJacobi.okl | 6 +- .../okl/ellipticPreconCoarsenHex3D.okl | 14 +- .../okl/ellipticPreconCoarsenQuad2D.okl | 8 +- .../okl/ellipticPreconCoarsenTet3D.okl | 4 +- .../okl/ellipticPreconCoarsenTri2D.okl | 5 +- .../okl/ellipticPreconProlongateHex3D.okl | 14 +- .../okl/ellipticPreconProlongateQuad2D.okl | 8 +- .../okl/ellipticPreconProlongateTet3D.okl | 4 +- .../okl/ellipticPreconProlongateTri2D.okl | 5 +- solvers/elliptic/okl/ellipticRhsBCHex3D.okl | 14 +- .../elliptic/okl/ellipticRhsBCIpdgHex3D.okl | 11 +- .../elliptic/okl/ellipticRhsBCIpdgQuad2D.okl | 6 +- .../elliptic/okl/ellipticRhsBCIpdgTet3D.okl | 6 +- .../elliptic/okl/ellipticRhsBCIpdgTri2D.okl | 6 +- solvers/elliptic/okl/ellipticRhsBCQuad2D.okl | 13 +- solvers/elliptic/okl/ellipticRhsBCQuad3D.okl | 23 +- solvers/elliptic/okl/ellipticRhsBCTet3D.okl | 7 +- solvers/elliptic/okl/ellipticRhsBCTri2D.okl | 7 +- solvers/elliptic/okl/ellipticRhsHex3D.okl | 7 +- 
solvers/elliptic/okl/ellipticRhsQuad2D.okl | 7 +- solvers/elliptic/okl/ellipticRhsQuad3D.okl | 7 +- solvers/elliptic/okl/ellipticRhsTet3D.okl | 8 +- solvers/elliptic/okl/ellipticRhsTri2D.okl | 8 +- solvers/elliptic/okl/ellipticSEMFEMAnterp.okl | 3 +- solvers/elliptic/okl/ellipticSEMFEMInterp.okl | 3 +- .../elliptic/src/ellipticBoundarySetup.cpp | 142 +- .../src/ellipticBuildOperatorDiagonal.cpp | 349 ++- .../ellipticBuildOperatorMatrixContinuous.cpp | 439 ++-- .../src/ellipticBuildOperatorMatrixIpdg.cpp | 701 +++--- solvers/elliptic/src/ellipticOperator.cpp | 43 +- solvers/elliptic/src/ellipticPlotFields.cpp | 40 +- solvers/elliptic/src/ellipticPreconJacobi.cpp | 19 +- .../elliptic/src/ellipticPreconMassMatrix.cpp | 60 +- .../elliptic/src/ellipticPreconMultiGrid.cpp | 76 +- .../src/ellipticPreconMultiGridLevel.cpp | 286 ++- solvers/elliptic/src/ellipticPreconOAS.cpp | 138 +- .../elliptic/src/ellipticPreconParAlmond.cpp | 20 +- solvers/elliptic/src/ellipticPreconSEMFEM.cpp | 199 +- solvers/elliptic/src/ellipticRun.cpp | 135 +- solvers/elliptic/src/ellipticSettings.cpp | 23 +- solvers/elliptic/src/ellipticSetup.cpp | 155 +- .../elliptic/src/ellipticSetupNewDegree.cpp | 108 +- .../elliptic/src/ellipticSetupRingPatch.cpp | 46 +- solvers/elliptic/src/ellipticSolve.cpp | 6 +- solvers/elliptic/src/ellipticZeroMean.cpp | 9 +- solvers/fokkerPlanck/data/fpeLinear2D.h | 4 +- solvers/fokkerPlanck/data/fpeLinear3D.h | 4 +- solvers/fokkerPlanck/fpe.hpp | 100 +- solvers/fokkerPlanck/fpeMain.cpp | 49 +- solvers/fokkerPlanck/makefile | 23 +- .../fokkerPlanck/okl/fpeAdvectionHex3D.okl | 21 +- .../fokkerPlanck/okl/fpeAdvectionQuad2D.okl | 6 +- .../fokkerPlanck/okl/fpeAdvectionTet3D.okl | 4 +- .../fokkerPlanck/okl/fpeAdvectionTri2D.okl | 4 +- .../okl/fpeCubatureAdvectionHex3D.okl | 35 +- .../okl/fpeCubatureAdvectionQuad2D.okl | 13 +- .../okl/fpeCubatureAdvectionTet3D.okl | 6 +- .../okl/fpeCubatureAdvectionTri2D.okl | 6 +- .../fokkerPlanck/okl/fpeDiffusionHex3D.okl | 11 +- 
.../fokkerPlanck/okl/fpeDiffusionQuad2D.okl | 12 +- .../fokkerPlanck/okl/fpeDiffusionRhsHex3D.okl | 32 +- .../okl/fpeDiffusionRhsQuad2D.okl | 13 +- .../fokkerPlanck/okl/fpeDiffusionRhsTet3D.okl | 8 +- .../fokkerPlanck/okl/fpeDiffusionRhsTri2D.okl | 8 +- .../fokkerPlanck/okl/fpeDiffusionTet3D.okl | 5 +- .../fokkerPlanck/okl/fpeDiffusionTri2D.okl | 5 +- solvers/fokkerPlanck/okl/fpeGradientHex3D.okl | 5 +- .../fokkerPlanck/okl/fpeGradientQuad2D.okl | 5 +- solvers/fokkerPlanck/okl/fpeGradientTet3D.okl | 5 +- solvers/fokkerPlanck/okl/fpeGradientTri2D.okl | 3 +- .../okl/fpeInitialCondition2D.okl | 2 +- .../okl/fpeInitialCondition3D.okl | 2 +- .../fokkerPlanck/okl/fpeMaxWaveSpeedHex3D.okl | 2 +- .../okl/fpeMaxWaveSpeedQuad2D.okl | 2 +- .../fokkerPlanck/okl/fpeMaxWaveSpeedTet3D.okl | 2 +- .../fokkerPlanck/okl/fpeMaxWaveSpeedTri2D.okl | 2 +- solvers/fokkerPlanck/src/fpePlotFields.cpp | 40 +- solvers/fokkerPlanck/src/fpeReport.cpp | 8 +- solvers/fokkerPlanck/src/fpeRun.cpp | 12 +- solvers/fokkerPlanck/src/fpeSettings.cpp | 37 +- solvers/fokkerPlanck/src/fpeSetup.cpp | 277 +-- solvers/fokkerPlanck/src/fpeStep.cpp | 49 +- solvers/fokkerPlanck/src/fpeSubcycle.cpp | 17 +- solvers/gradient/data/gradientCos2D.h | 2 +- solvers/gradient/data/gradientCos3D.h | 2 +- solvers/gradient/gradient.hpp | 37 +- solvers/gradient/gradientMain.cpp | 49 +- solvers/gradient/makefile | 15 +- .../okl/gradientInitialCondition2D.okl | 2 +- .../okl/gradientInitialCondition3D.okl | 2 +- solvers/gradient/okl/gradientVolumeHex3D.okl | 20 +- solvers/gradient/okl/gradientVolumeQuad2D.okl | 4 +- solvers/gradient/okl/gradientVolumeTet3D.okl | 381 ++-- solvers/gradient/okl/gradientVolumeTri2D.okl | 295 ++- solvers/gradient/src/gradientPlotFields.cpp | 44 +- solvers/gradient/src/gradientReport.cpp | 4 +- solvers/gradient/src/gradientRun.cpp | 4 +- solvers/gradient/src/gradientSettings.cpp | 25 +- solvers/gradient/src/gradientSetup.cpp | 82 +- solvers/ins/data/insBeltrami3D.h | 2 +- 
solvers/ins/data/insUniform2D.h | 2 +- solvers/ins/data/insUniform3D.h | 2 +- solvers/ins/data/insVortex2D.h | 2 +- solvers/ins/ins.hpp | 174 +- solvers/ins/insMain.cpp | 49 +- solvers/ins/makefile | 23 +- solvers/ins/okl/insAdvectionHex3D.okl | 11 +- solvers/ins/okl/insAdvectionQuad2D.okl | 6 +- solvers/ins/okl/insAdvectionQuad3D.okl | 18 +- solvers/ins/okl/insAdvectionTet3D.okl | 4 +- solvers/ins/okl/insAdvectionTri2D.okl | 4 +- solvers/ins/okl/insConstrainQuad3D.okl | 2 +- solvers/ins/okl/insCubatureAdvectionHex3D.okl | 53 +- .../ins/okl/insCubatureAdvectionQuad2D.okl | 14 +- solvers/ins/okl/insCubatureAdvectionTet3D.okl | 6 +- solvers/ins/okl/insCubatureAdvectionTri2D.okl | 6 +- solvers/ins/okl/insDiffusionHex3D.okl | 12 +- solvers/ins/okl/insDiffusionQuad2D.okl | 7 +- solvers/ins/okl/insDiffusionQuad3D.okl | 75 +- solvers/ins/okl/insDiffusionTet3D.okl | 6 +- solvers/ins/okl/insDiffusionTri2D.okl | 6 +- solvers/ins/okl/insDivergenceHex3D.okl | 12 +- solvers/ins/okl/insDivergenceQuad2D.okl | 6 +- solvers/ins/okl/insDivergenceQuad3D.okl | 110 +- solvers/ins/okl/insDivergenceTet3D.okl | 4 +- solvers/ins/okl/insDivergenceTri2D.okl | 4 +- solvers/ins/okl/insGradientHex3D.okl | 10 +- solvers/ins/okl/insGradientQuad2D.okl | 6 +- solvers/ins/okl/insGradientQuad3D.okl | 188 +- solvers/ins/okl/insGradientTet3D.okl | 4 +- solvers/ins/okl/insGradientTri2D.okl | 7 +- solvers/ins/okl/insInitialCondition2D.okl | 2 +- solvers/ins/okl/insInitialCondition3D.okl | 2 +- solvers/ins/okl/insMaxWaveSpeedHex3D.okl | 2 +- solvers/ins/okl/insMaxWaveSpeedQuad2D.okl | 2 +- solvers/ins/okl/insMaxWaveSpeedTet3D.okl | 2 +- solvers/ins/okl/insMaxWaveSpeedTri2D.okl | 2 +- .../ins/okl/insPressureIncrementRhsHex3D.okl | 23 +- .../ins/okl/insPressureIncrementRhsQuad2D.okl | 15 +- .../ins/okl/insPressureIncrementRhsTet3D.okl | 18 +- .../ins/okl/insPressureIncrementRhsTri2D.okl | 18 +- solvers/ins/okl/insPressureRhsHex3D.okl | 32 +- solvers/ins/okl/insPressureRhsQuad2D.okl | 22 +- 
solvers/ins/okl/insPressureRhsQuad3D.okl | 13 +- solvers/ins/okl/insPressureRhsTet3D.okl | 19 +- solvers/ins/okl/insPressureRhsTri2D.okl | 19 +- solvers/ins/okl/insSubcycleAdvection.okl | 2 +- solvers/ins/okl/insSubcycleAdvectionHex3D.okl | 11 +- .../ins/okl/insSubcycleAdvectionQuad2D.okl | 6 +- .../ins/okl/insSubcycleAdvectionQuad3D.okl | 22 +- solvers/ins/okl/insSubcycleAdvectionTet3D.okl | 4 +- solvers/ins/okl/insSubcycleAdvectionTri2D.okl | 4 +- .../okl/insSubcycleCubatureAdvectionHex3D.okl | 67 +- .../insSubcycleCubatureAdvectionQuad2D.okl | 14 +- .../okl/insSubcycleCubatureAdvectionTet3D.okl | 6 +- .../okl/insSubcycleCubatureAdvectionTri2D.okl | 13 +- solvers/ins/okl/insVelocityGradientHex3D.okl | 3 +- solvers/ins/okl/insVelocityGradientQuad2D.okl | 5 +- solvers/ins/okl/insVelocityGradientQuad3D.okl | 3 +- solvers/ins/okl/insVelocityGradientTet3D.okl | 3 +- solvers/ins/okl/insVelocityGradientTri2D.okl | 5 +- solvers/ins/okl/insVelocityRhsHex3D.okl | 39 +- solvers/ins/okl/insVelocityRhsQuad2D.okl | 28 +- solvers/ins/okl/insVelocityRhsQuad3D.okl | 66 +- solvers/ins/okl/insVelocityRhsTet3D.okl | 19 +- solvers/ins/okl/insVelocityRhsTri2D.okl | 19 +- solvers/ins/okl/insVorticityHex3D.okl | 6 +- solvers/ins/okl/insVorticityQuad2D.okl | 5 +- solvers/ins/okl/insVorticityQuad3D.okl | 3 +- solvers/ins/okl/insVorticityTet3D.okl | 4 +- solvers/ins/okl/insVorticityTri2D.okl | 3 +- solvers/ins/src/insAdvection.cpp | 10 +- solvers/ins/src/insDiffusion.cpp | 12 +- solvers/ins/src/insDivergence.cpp | 10 +- solvers/ins/src/insGradient.cpp | 10 +- solvers/ins/src/insPlotFields.cpp | 52 +- solvers/ins/src/insPressureIncrementSolve.cpp | 17 +- solvers/ins/src/insPressureSolve.cpp | 17 +- solvers/ins/src/insReport.cpp | 8 +- solvers/ins/src/insRun.cpp | 12 +- solvers/ins/src/insSettings.cpp | 57 +- solvers/ins/src/insSetup.cpp | 634 +++--- solvers/ins/src/insStep.cpp | 44 +- solvers/ins/src/insSubcycle.cpp | 23 +- solvers/ins/src/insVelocitySolve.cpp | 39 +- 
solvers/lbs/data/lbsGaussian2D.h | 2 +- solvers/lbs/data/lbsGaussian3D.h | 2 +- solvers/lbs/data/lbsUniform2D.h | 2 +- solvers/lbs/data/lbsUniform3D.h | 2 +- solvers/lbs/lbs.hpp | 96 +- solvers/lbs/lbsMain.cpp | 49 +- solvers/lbs/makefile | 25 +- solvers/lbs/okl/lbsCollisionHex3D.okl | 320 +-- solvers/lbs/okl/lbsCollisionQuad2D.okl | 356 +-- solvers/lbs/okl/lbsCollisionTet3D.okl | 240 +- solvers/lbs/okl/lbsCollisionTri2D.okl | 210 +- solvers/lbs/okl/lbsInitialCondition2D.okl | 2 +- solvers/lbs/okl/lbsInitialCondition3D.okl | 2 +- solvers/lbs/okl/lbsSurfaceHex3D.okl | 42 +- solvers/lbs/okl/lbsSurfaceQuad2D.okl | 95 +- solvers/lbs/okl/lbsSurfaceTet3D.okl | 186 +- solvers/lbs/okl/lbsSurfaceTri2D.okl | 6 +- solvers/lbs/okl/lbsVolumeHex3D.okl | 68 +- solvers/lbs/okl/lbsVolumeQuad2D.okl | 98 +- solvers/lbs/okl/lbsVolumeTet3D.okl | 98 +- solvers/lbs/okl/lbsVolumeTri2D.okl | 6 +- solvers/lbs/okl/lbsVorticityHex3D.okl | 6 +- solvers/lbs/okl/lbsVorticityQuad2D.okl | 6 +- solvers/lbs/okl/lbsVorticityTet3D.okl | 4 +- solvers/lbs/okl/lbsVorticityTri2D.okl | 4 +- solvers/lbs/src/lbsLatticeSetup.cpp | 20 +- solvers/lbs/src/lbsPlotFields.cpp | 52 +- solvers/lbs/src/lbsPmlSetup.cpp | 22 +- solvers/lbs/src/lbsReport.cpp | 8 +- solvers/lbs/src/lbsRun.cpp | 12 +- solvers/lbs/src/lbsSettings.cpp | 21 +- solvers/lbs/src/lbsSetup.cpp | 185 +- solvers/lbs/src/lbsStep.cpp | 62 +- test/makefile | 2 +- test/test.py | 22 +- test/testAcoustics.py | 2 +- test/testAdvection.py | 2 +- test/testBns.py | 2 +- test/testCns.py | 2 +- test/testElliptic.py | 6 +- test/testFokkerPlanck.py | 2 +- test/testGradient.py | 6 +- test/testInitialGuess.py | 2 +- test/testIns.py | 2 +- test/testLbs.py | 2 +- test/testLinearSolver.py | 2 +- test/testMesh.py | 4 +- test/testParAdogs.py | 96 + test/testParAlmond.py | 4 +- test/testTimeStepper.py | 4 +- 788 files changed, 30979 insertions(+), 38945 deletions(-) create mode 100644 .github/CodeDiagram.png delete mode 100644 3rdParty/gslib/.travis.yml delete mode 
100644 3rdParty/gslib/LICENSE delete mode 100644 3rdParty/gslib/Makefile delete mode 100644 3rdParty/gslib/README.md delete mode 100644 3rdParty/gslib/RELEASE.md delete mode 100755 3rdParty/gslib/cdep.py delete mode 100644 3rdParty/gslib/makefile.cdep delete mode 100755 3rdParty/gslib/odep_info.py delete mode 100644 3rdParty/gslib/src/c99.h delete mode 100644 3rdParty/gslib/src/comm.c delete mode 100644 3rdParty/gslib/src/comm.h delete mode 100644 3rdParty/gslib/src/crs.h delete mode 100644 3rdParty/gslib/src/crystal.c delete mode 100644 3rdParty/gslib/src/crystal.h delete mode 100644 3rdParty/gslib/src/fail.c delete mode 100644 3rdParty/gslib/src/fail.h delete mode 100644 3rdParty/gslib/src/fcrystal.c delete mode 100644 3rdParty/gslib/src/findpts.c delete mode 100644 3rdParty/gslib/src/findpts.h delete mode 100644 3rdParty/gslib/src/findpts_el.h delete mode 100644 3rdParty/gslib/src/findpts_el_2.c delete mode 100644 3rdParty/gslib/src/findpts_el_3.c delete mode 100644 3rdParty/gslib/src/findpts_imp.h delete mode 100644 3rdParty/gslib/src/findpts_local.c delete mode 100644 3rdParty/gslib/src/findpts_local.h delete mode 100644 3rdParty/gslib/src/findpts_local_imp.h delete mode 100644 3rdParty/gslib/src/gen_poly_imp.c delete mode 100644 3rdParty/gslib/src/gs.c delete mode 100644 3rdParty/gslib/src/gs.h delete mode 100644 3rdParty/gslib/src/gs_defs.h delete mode 100644 3rdParty/gslib/src/gs_local.c delete mode 100644 3rdParty/gslib/src/gs_local.h delete mode 100644 3rdParty/gslib/src/gslib.h delete mode 100644 3rdParty/gslib/src/lob_bnd.c delete mode 100644 3rdParty/gslib/src/lob_bnd.h delete mode 100644 3rdParty/gslib/src/mem.h delete mode 100644 3rdParty/gslib/src/name.h delete mode 100644 3rdParty/gslib/src/obbox.c delete mode 100644 3rdParty/gslib/src/obbox.h delete mode 100644 3rdParty/gslib/src/poly.c delete mode 100644 3rdParty/gslib/src/poly.h delete mode 100644 3rdParty/gslib/src/poly_imp.h delete mode 100644 3rdParty/gslib/src/rand_elt_test.c delete mode 
100644 3rdParty/gslib/src/rand_elt_test.h delete mode 100644 3rdParty/gslib/src/sarray_sort.c delete mode 100644 3rdParty/gslib/src/sarray_sort.h delete mode 100644 3rdParty/gslib/src/sarray_transfer.c delete mode 100644 3rdParty/gslib/src/sarray_transfer.h delete mode 100644 3rdParty/gslib/src/sort.c delete mode 100644 3rdParty/gslib/src/sort.h delete mode 100644 3rdParty/gslib/src/sort_imp.h delete mode 100644 3rdParty/gslib/src/tensor.c delete mode 100644 3rdParty/gslib/src/tensor.h delete mode 100644 3rdParty/gslib/src/types.h delete mode 100644 3rdParty/gslib/tests/comm_test.c delete mode 100644 3rdParty/gslib/tests/crystal_test.c delete mode 100644 3rdParty/gslib/tests/findpts_el_2_test.c delete mode 100644 3rdParty/gslib/tests/findpts_el_2_test2.c delete mode 100644 3rdParty/gslib/tests/findpts_el_3_test.c delete mode 100644 3rdParty/gslib/tests/findpts_el_3_test2.c delete mode 100644 3rdParty/gslib/tests/findpts_local_test.c delete mode 100644 3rdParty/gslib/tests/findpts_test.c delete mode 100644 3rdParty/gslib/tests/fortran/f-igs.f delete mode 100644 3rdParty/gslib/tests/gs_test.c delete mode 100644 3rdParty/gslib/tests/gs_test_gop_blocking.c delete mode 100644 3rdParty/gslib/tests/gs_test_gop_nonblocking.c delete mode 100644 3rdParty/gslib/tests/gs_test_old.c delete mode 100644 3rdParty/gslib/tests/gs_unique_test.c delete mode 100644 3rdParty/gslib/tests/lob_bnd_test.c delete mode 100644 3rdParty/gslib/tests/obbox_test.c delete mode 100644 3rdParty/gslib/tests/poly_test.c delete mode 100755 3rdParty/gslib/tests/run_tests.sh delete mode 100644 3rdParty/gslib/tests/sarray_sort_test.c delete mode 100644 3rdParty/gslib/tests/sarray_transfer_test.c delete mode 100644 3rdParty/gslib/tests/sort_test.c delete mode 100644 3rdParty/gslib/tests/sort_test2.c create mode 100644 include/comm.hpp create mode 100644 include/memory.hpp delete mode 100644 include/mesh/mesh2D.hpp delete mode 100644 include/mesh/mesh3D.hpp delete mode 100644 include/mesh/meshDefines3D.h 
mode change 100644 => 100755 include/ogs.hpp create mode 100644 include/ogs/ogsBase.hpp delete mode 100644 include/ogs/ogsDefs.h create mode 100644 include/ogs/ogsExchange.hpp delete mode 100644 include/ogs/ogsKernels.hpp create mode 100644 include/ogs/ogsOperator.hpp create mode 100644 include/ogs/ogsUtils.hpp rename libs/mesh/meshPlotInterpTri3D.cpp => include/operator.hpp (73%) rename libs/mesh/meshPartitionStatistics.cpp => include/parAdogs.hpp (52%) create mode 100644 include/parAdogs/parAdogsGraph.hpp create mode 100644 include/parAdogs/parAdogsMatrix.hpp create mode 100644 include/parAdogs/parAdogsMultigrid.hpp rename include/{mesh/meshDefines2D.h => parAdogs/parAdogsPartition.hpp} (67%) delete mode 100644 include/parAlmond/parAlmondMultigrid.hpp create mode 100644 include/timer.hpp create mode 100644 libs/core/comm.cpp create mode 100644 libs/core/exception.cpp delete mode 100644 libs/core/factor.cpp delete mode 100644 libs/core/matrixEig.cpp delete mode 100644 libs/core/matrixRightSolve.cpp rename libs/{mesh/meshOccaSetupQuad2D.cpp => core/memory.cpp} (65%) delete mode 100644 libs/core/parallelSort.cpp create mode 100644 libs/core/rankDecomp.cpp rename solvers/ins/src/insBoundarySetup.cpp => libs/core/timer.cpp (52%) rename libs/{core/matrixConditionNumber.cpp => linAlg/linAlgMatrixConditionNumber.cpp} (52%) create mode 100644 libs/linAlg/linAlgMatrixEig.cpp rename libs/{core/matrixInverse.cpp => linAlg/linAlgMatrixInverse.cpp} (56%) create mode 100644 libs/linAlg/linAlgMatrixRightSolve.cpp rename libs/{core/matrixTranspose.cpp => linAlg/linAlgMatrixTranspose.cpp} (61%) delete mode 100644 libs/mesh/mesh.cpp create mode 100644 libs/mesh/meshConnectFaceNodes.cpp delete mode 100644 libs/mesh/meshConnectFaceNodes2D.cpp delete mode 100644 libs/mesh/meshConnectFaceNodes3D.cpp create mode 100644 libs/mesh/meshConnectFaceVertices.cpp create mode 100644 libs/mesh/meshConnectNodes.cpp rename libs/mesh/{meshParallelGatherScatterSetup.cpp => 
meshGatherScatterSetup.cpp} (53%) delete mode 100644 libs/mesh/meshGeometricPartition2D.cpp delete mode 100644 libs/mesh/meshGeometricPartition3D.cpp delete mode 100644 libs/mesh/meshOccaSetup.cpp delete mode 100644 libs/mesh/meshOccaSetup2D.cpp delete mode 100644 libs/mesh/meshOccaSetup3D.cpp delete mode 100644 libs/mesh/meshOccaSetupHex3D.cpp delete mode 100644 libs/mesh/meshOccaSetupTet3D.cpp delete mode 100644 libs/mesh/meshOccaSetupTri2D.cpp delete mode 100644 libs/mesh/meshOccaSetupTri3D.cpp delete mode 100644 libs/mesh/meshParallelConnectNodes.cpp delete mode 100644 libs/mesh/meshParallelConnectOpt.cpp delete mode 100644 libs/mesh/meshParallelReaderQuad3D.cpp delete mode 100644 libs/mesh/meshParallelReaderTri3D.cpp create mode 100644 libs/mesh/meshPartition.cpp delete mode 100644 libs/mesh/meshPlotInterpQuad3D.cpp rename libs/mesh/{meshParallelReaderHex3D.cpp => meshReadGmshHex3D.cpp} (60%) rename libs/mesh/{meshParallelReaderQuad2D.cpp => meshReadGmshQuad2D.cpp} (60%) create mode 100644 libs/mesh/meshReadGmshQuad3D.cpp rename libs/mesh/{meshParallelReaderTet3D.cpp => meshReadGmshTet3D.cpp} (57%) rename libs/mesh/{meshParallelReaderTri2D.cpp => meshReadGmshTri2D.cpp} (60%) create mode 100644 libs/mesh/meshReadGmshTri3D.cpp create mode 100644 libs/mesh/meshSetElementType.cpp delete mode 100644 libs/ogs/gs.cpp delete mode 100644 libs/ogs/hostGather.cpp delete mode 100644 libs/ogs/hostGatherScatter.cpp delete mode 100644 libs/ogs/hostScatter.cpp delete mode 100644 libs/ogs/occaGather.cpp delete mode 100644 libs/ogs/occaGatherScatter.cpp delete mode 100644 libs/ogs/occaGatheredHaloExchange.cpp delete mode 100644 libs/ogs/occaScatter.cpp create mode 100644 libs/ogs/ogsAllToAll.cpp create mode 100644 libs/ogs/ogsAuto.cpp create mode 100644 libs/ogs/ogsCrystalRouter.cpp create mode 100644 libs/ogs/ogsHalo.cpp delete mode 100644 libs/ogs/ogsKernels.cpp create mode 100644 libs/ogs/ogsOperator.cpp create mode 100644 libs/ogs/ogsPairwise.cpp create mode 100644 
libs/ogs/ogsUtils.cpp delete mode 100644 libs/ogs/okl/gatherScatter.okl create mode 100644 libs/ogs/okl/ogsKernels.okl create mode 100644 libs/parAdogs/parAdogsConnect.cpp create mode 100644 libs/parAdogs/parAdogsCuthillMckee.cpp create mode 100644 libs/parAdogs/parAdogsFiedlerVector.cpp create mode 100644 libs/parAdogs/parAdogsGraph.cpp create mode 100644 libs/parAdogs/parAdogsInertialBipartition.cpp create mode 100644 libs/parAdogs/parAdogsInertialPartition.cpp create mode 100644 libs/parAdogs/parAdogsMatrix.cpp create mode 100644 libs/parAdogs/parAdogsMeshPartition.cpp create mode 100644 libs/parAdogs/parAdogsMultigrid.cpp create mode 100644 libs/parAdogs/parAdogsMultigridAggregate.cpp create mode 100644 libs/parAdogs/parAdogsMultigridCoarseSolver.cpp create mode 100644 libs/parAdogs/parAdogsMultigridLaplacian.cpp create mode 100644 libs/parAdogs/parAdogsMultigridSetup.cpp create mode 100644 libs/parAdogs/parAdogsMultigridSmooth.cpp create mode 100644 libs/parAdogs/parAdogsMultigridSmoothPrologator.cpp create mode 100644 libs/parAdogs/parAdogsMultigridSpMM.cpp create mode 100644 libs/parAdogs/parAdogsMultigridTentativeProlongator.cpp create mode 100644 libs/parAdogs/parAdogsMultigridTranspose.cpp create mode 100644 libs/parAdogs/parAdogsParallelPivot.cpp create mode 100644 libs/parAdogs/parAdogsRefine.cpp rename libs/{mesh/meshOccaSetupQuad3D.cpp => parAdogs/parAdogsSettings.cpp} (65%) create mode 100644 libs/parAdogs/parAdogsSolve.cpp create mode 100644 libs/parAdogs/parAdogsSpectralBipartition.cpp create mode 100644 libs/parAdogs/parAdogsSpectralPartition.cpp create mode 100644 libs/timeStepper/timeStepper.cpp mode change 100644 => 100755 test/testLbs.py create mode 100755 test/testParAdogs.py diff --git a/.github/CodeDiagram.png b/.github/CodeDiagram.png new file mode 100644 index 0000000000000000000000000000000000000000..649e53acc60b74d298043b4a9d729dc10cdb3188 GIT binary patch literal 43595 zcmeFZcT|&Iw=YUZkN_Hrh7xQP0f|baNC)Y?NRuX_5Tq&i3X;%4K)Mu>5{eKyC{3iJ 
zpdd;Qf*6Vj1d!em%6$U(>ig~e?R)mPXPkTQ8Rrj%Le{gMHRqaZ&iVVzxe}_YtpcKl z(36pofz(u$^vTF5G=U$BBQ(HY8h`Td0RNER*H=*>EB?ee1N=gDM_x;wjI1o`=(Y_t z@H?HWs_A_)GR79t4|$sl<`x-Q-l&?Ayx~L3`Bd6!E~D4GWt6cB^w*BPCD5+X^&V@yR##Yd?W+~aUav3 zU4E2~yxo9X*}nIw@QUN@Z>F;&jz8w^o@Ss4XtIW#cru8#xd}T#`VDhjRP*q!M*hK} zq)!=Wvedg^7xzE5Hl%~_9zJwF<^!xV>7wF=hV6S_?xZ8Xze?XJT%G%3aOKM_u)>j3 zfziR_yWcOTZ)tugX{>5&xD{a->_AwZxwknMqEY2M@*N$(7~657&QLjmQ$R;{?div4 zi8yB9!E75}Sw9^=D~ZKMN{wA=85$I?)7dMUQ|UhSwR<}YiB>6Y!L>_-+1qWvOPCcP ztec7Hd)rgzWbZdovm0`1zf`@I6e;oJ#1`JT;k<#l*+)jwtby)qqjw$<+l|$Cc^wSl zn)Y*9&>hTc#r>6jqTlY+Q{ggllUh3Fn6!ApxoYIW<-bTDG537&`>eM7@Inu4EQnuf zV!~FO^k){<&6JJ01*DM`Rg|da!*EWKhvpIcQnOIk9%00R*>h}~H&{k>7B;c2z#@+b zNG@Joly<}Q-h%LYzXsQrG$Nkv3v1uF(YU7+W}u`#95_zRNj5n68U3qULlOOcn#PGC&VebK8^eG zb|!kB=8a)4a7`ZWx@ zU@}5qG6>N=b>CPA))7na zceHUh&Dd9VsF#1N#?!Y_Y~<&}Z9?wn2D>bGeJhBTNOl=XBY&#y`lFfMepHU>=?KL$ zGeM#xJpw!NTj2<%hu?!AybkLd6b-Xfo$@`Dd734dPLP_+;k|*3S9=ggpV9Uu9&an^ zwfy)drEvE-Fmd4CZdg!`U#ZC8z)KWzG?CH_w_yBmJGt0yhtODZw0R*j?9!m8SbeT%6 zG{A1?$isant@m2)MNQR};eI+wS-2M85ucll*`qqV*NKQL+m+jNYP}W6@~JADewE~e zm0q}36?r9jWUme_|H1}NQhB6domO^7(jJRb_G$ibS)_~UVD{^ebURHvPa!zHThLPy zwY;iFdp)eg+lG{H>?|hd4{YxgX!}_jUn{9f7G>rx`H6D@+m%Oh-^X;iT2;^>OGS@` z_@rdN{$;C@@ixmH~O6)+YFM$?sa%jUOV|RzNu)sfT#u-<09k-J!lyZDYHE{Mw^pAzB z2&+Nm8W-zkr*rt~min1WBR2c=BII|_y6u5|A(+JfQXA@@JqC%!NZk#Y((6>ScV4X} zLQ3%grG?HRlbP;x&6Qz(!gAFf2#dX^bPYUTsW%hFL%5tq0eozff4wS27 z0LyuamznMisL9edxC~hdGTAqE zyDLL$9pp|emt@?O)+|CLgQeu&kZj$~N@yeyo#HdkBE8pN%(@)IZRx1dF<40m6m4MA z!@j@drY0|~aR)6uUr9UH3kWsfd8(8A(6Q<9w`7A8Wd+b&4GbuzVJwmWwzUkV+XM?# zw%E#>^(ozsS4rO%eOeaTcK&Fo)5$=oMwGDD+YDqUN5A63ZL%3Ji&e9&+w!x)*mvr^ zFV#TcMPfE_W0r`8DXo-R$#OX%Anhh?S;WS*jg1go(G^F~x`V^%NX$x)z0!>Tt=#us zr?%oq%H?mNa*tkK8m6dWg33abw0QKuCwqii0+Cr!o6A^cXWQYw808%F!E1pzn06AW0A@+f zT1>q}?I~#)xaoA0W3m}(EBhX6OQy#wGDkVIvQh!H`GWaNWP)IDw5$HtaY_GYsM5)r zes@U|!L2}P{i48O5u|FLl-VO>t*iSu%)4iROFBx4F`B>{^sw-ezK= z1hY>#ou^A5kCr?A3J-k$jB367UQ5Pzyqn`8aHjf;`sJXJPl%*)&1$CXtshiqu7bOR 
ziTt{)VhV_)J|ThYj|Lfu;J;AIWTXRgW=_n3`_B*pL|;IYaH_HP$1({$?r{E$jM8PH z`?p>MS*K^{QDUZ^b1?T?_ALdXpSXDS!GhK@84L!Y5Z)fue+j^lYa)6yiXdM|0KX#n zGE7rXP>O6OLtf1`Y1Krb=LFzBhHZS@p*CKYXs3Ue!SA@7PTKk-EZGHlk1vbP!!`Rq z2FEA%!5m(f%Jcx)g5HOJ#%my^2`Q2(P@-EYTbo5_t);f>U%)koLSPp1mi9`qkR1Lq zfuayzeM6GGfCuk+-VG^KSf#02R-J#J$%TL62^fP(Fd2q!xfc$)eXY08-te-DUa&$( zs{_2)65bNHU7Es83jzzA-#1sMq~<1dmsT$wU&%o-pJ>BOnrBj_G~>!N$O_S8Bwzg^ z$ScG0W#*LdTdyt2eD8rBq047p0t0V~pUR*uZ?{2-9`oY(| zaw&`?zHa6Jd4Vs zWg&KMA8`T7I)|N?1G~#qL||cmYe}_ihQX^zIsl26E{S^Ce2@O;)$fA4Cgja-$aHFv3Xn^#MV;W2Mt5Cd=aIk8&sxAQm4WHe+9mJ5$C z5_6|`SW@F7D`Tz~S$X>Jm>$Hhp1;>IDzm)Mi*dsYiTge}u_fue8EETQ$r`CKCgbk? z_)S|(R2iS9j-)byN0##uo^E~LEr*6_J0ukzB_rRmYpEHlJkF>7?s%4uL15++1tN2p z*j@HX_p3r+KK*Q>NB8xIaRZQ5lCc2cIX+<#VnDJnwuoYOE@<;b+6}gzokudF(x==e zl#=d5NL-iLe*3`i4VmX|H9FAtX0=ns#NCH7cMs5N7>|@VZBlD3gWzD3Kg;ZK-n4BleWW; z4Mc#g;F)h9?>P7@_OieDu{Bx2gp$a1Xz14F$6)AS9|D6a$DQAs`PKufJ|m@Lnqi4y z*S`y~5b=VljNz>KIleW>4D6Hy+N$;;og?5f(@e{HR=uSWNaCcNI4{DYL6dW+%`5Wa zU^>ae^PQJ`Ve?X3yk{qzb3p5|TcDDb?@&%=W*;gu&B~JaJ)9i~!I6T6;PtEdPT{wo zljX7FgIJy`4}^bvuD6k#EKrwTL>yLGu-j%>QFz4)w;V%74u`;HtcRf-@vcvlxE#~n z(6XooM-br-$@A}Tt*>mUqr_fwkcS`rfKKm~o$_dL8w`)1i27Y3a+R+`w{|*lUF_p4 zXR6tUonrSlsZ+-R%A;9Gv+L?Q|kF^}}S9~pFpMxGm!p4u>O<7w!`5GC$*Kpdlk zQT8VtYa33 z0O|C*T)nm9car43+8XO$g&2usU;Nh~lW&=Sk{8JiYNZ?A95Hk2QF(|cT--eBgg~D2 zj-m>*BO^>E<_%bG{!F$qo;BZDpCy}@?Yf_(l$vGc`|F6SF8yb-=D?t`Wn2(Q(y=|R zQ`?dEP{|c$LFHZV?JWd5U37^4q78dC624A8vIA~eR%SXY{rnVt8$?TuQkQ=lKl?S{ zEmYgy7Gp~hfp!5R78o@mbcn|*b6Y=(d0=Kac>Cl}jxcst&e}yQ?9cX= zi%Hy9u@9E~ z+x!RrBH5-Vo(=ajFqo?Kj40GCSU?lIu7Gezini*kCZE^nS@!h%ueHPrhwkEItW|8-N}|N|XIaOa+1s!*0&5*QY3=7fg4Xas~!Lc_q2@Q>%dt z>~ECIq7#DEhzlEB^D|4QM_i0Mh{*hN|6E2`&Sg!4Kxf-|EVnrMP|f2^HY|?_CUMAr zq7n$t!o_|;mJ7@{SChsYO7R+qB;U%31$J3Sha}&hZM04VkL_A7JR9-Vfy0a$C#CJ( zhQVsI3_gub3j<^W2b1~(=|)c%#!C+!rMzH!=fz{MQx#`OXf2PypfC4LboONCD3$%} zX?lE+mzrW0o_e~X5PMa+Q@MLp0ii(x-UcUmSHr>9P9K1D=y1#^B-3X%|LljT%$cCR z1R!4s%ekbP)YnyoY0{*9!`x82<}I5tT}a35#M7B>Qjx;cBWl$YSh%$*{n6!Ym&zgf 
zR#B|$<6iAPIN|+AFt-b(dQf3G?E2=UY$v%L76gtoqERY3F@5&7*5fg&cpfc2xq>G- zTXEUijqiKQOGWr6unwYKKL42S5Lx~PiX^2pW&6vVzg`a~&n#SX_Se2aaKNRL6@6e* znm?n7eXxBgf`Q<8h=W;x9q`))fFKDs9uQ3X%jtm$B-wXiOB)8hXM>+Lm@&uag3-sd zt#_$raw+H7i5pc}?N?4u7aSN`K{wiRnf6!GcR(_cl3~#b8;T^8E-Rm#uXyFYXL0gg zmE{6Lgvq(a&!t7K{tQcea_Bg8Jk}l^b?Bd!1HC@5|I*SQQmq1rS7Zs@-8*C=qUrn>OkWQDcXmR8? zT9A@|BC(&2QMPPeCv89`)>ks37U4UwA5Tr3;HUdE>PN)^yh(D{2UHlz9cTgByFs0{ zz8qh^>*?l74+hEZE$}mK^lt8+D+Se1j&}KVX7!3YS9HwT{=kTEScV8aM|>)$0Q zR>&)($yxg)WNvx$CV&|~#0w1x9PMM17P#_@-L?>UuuoWD&^2Go9{j~NKd2kr=e*y@ zIW@)I{6$!QSv#0yj-rBECk*t9^TvjA?ZahYVyXW#@&6)KD88Dp>kt+bwwVl)+TF}} zVY{=tAzP;>x!60NB(^~H3^q2}qa)aGIp#lZaH^V!NQ3t+2i)j##|Es!P;u7W^At>i zf4Bsvg3ZkfRUe}99aGpkDzcsP4ELIg+XA|tt==0j zwYo6R;XR{a{@gAAWlrK zIE6fmv*VM&xM<}7^%-`I@WTU}{glm~#6tEeP$t_`CzEn!{#E>F4_vg~D*YVV_*BeBMlwq-3JKfDA(`>J>i^u1sHeJ&L5bZcyW)BS2Iaf0r92ICX9 zE`ll(vE(+x7VI+2xF#aqEtu{H$fGpEz3<(Xu7)s!fNUV83$g9G!rHmAcj@y2(@@WZ zyjdMCyjf;HD(cz%xb-EJ32z=JARzOF8gVp@g#++~8JDN8X;D4#&zZ56NTO>H(#^`E zp>C|*6ckM^E--@e?56LuJkApcy{xHjAC^Dq04?)rC`>+MA^nnpMb4^Qb~G0YS$#(e z=3v&_+6k$f=z93G*zq_Ebmgepe!WLBz_L$ivp|jTP*{|KKfv8_O?h^6C+ZPYy;EJ7 zA+8t^6g8;;&bCssF3O(qEL%4fEZvX6+J;FIrVqvg2m`m&!*m|_4f*rE3kLlAG=rC) zUt^eNQt@pCV_PzAy;C6GcOnHmpBH>mX5=8t2%97!+gzGXD|k!U8IV&Tnn{mX$odNb ze@qd&x+ITuymamz-*1m4T4UKq)?}h|k_KnZ!{x9!QcW7`ndFg7_sv4F3dpmA70ZVl>t$_g9*NWI~V+Yw#iND>;7=hB$#^ksxyw`4i- z1Oex9GoQqGc+(Rc=SvdyrIbs>{OtqE&ph1N*t5!$)vMiDI-@?> zb0SCUd~;#ELyt@da)tqo+fODn=GGMWkfRM&%ptN#K|YA*!Q}v6_$cbpgHHD?(kW$= ztmYMl`qu!1G5uvk>z9kr5^MkFi^0l0BDP~X{E``Ccf`9*rgI-M<1aZk?0%n~eb<~m zD7h@7O8={l)c%|wJVv=Qy`?Xl_rdpqbtM_9?REdkY*j|oq+wi5k*p%X8_VNV+fe18 z`KoWd2{y~{;}x+vNM)JGYr}2zzDBv%(NNNI{~@057kM9R86kI8z6cvHY+|LrGhVp4 zX(OtuvcL7s=b@sniRSfndX`xMsQrqMP#N|p7mw@Zi8~b^G-<`(K}=$g+a*Q+(78>| zdv5hIq7)%=VzhK2JDqCn>sW5Q6{ruOXG1*dmz5&uKfPky^h$*-67Aqz04xl6;WVD> z;iTvqp~n^oPW#C&tJrl%I2U}Tr)YOeWbP)1-PQ3w<>#2=`+&^5jg{Tlv?vk#BwI{f z9xxm1)F2~QZ^_r!OJp6+sEiJNur115nGtO4VQrzqI$~p z0^N%UPJ2_`beZvQ!T5^1VAlHdd|8;;JAU{0OXl0OxDi9;JGw{JHVT3TKCI2WC-Mjk 
zM*R(x=e&OV&E81g;Jo_Lqz-zZk=5~&YTY{9uG8At-%Qe=biZ(kZmoyZ-%5&{L9&QzmMouf3p+31zOn-66H(9 z?YXMktHXUeCestI#V=1*ffnm~A3>$;v+i+G6K?H;w71+Lj6XNI(p%RIF3@-~jE}wH zb=w37p=u(%&c4`6f)$1~yL6P$x*uOPzR(=1Q>C~a54xL`;w(5=RcslE#_nN}K|$Q7cuBR`I3UFT?N*15_4UJ$4$s#X#Kxqvd)}zV z(g$jUzMs zB~#R!Yh^Ckc};I+7O%GRKxJdN*R47ni9;^ZI5Syc%pvG)KPr8N^OcLUaCu=oyZ=4@ zC|tjkxn#GauVW5YV^#n#khQoQ(7mCm4H*pUzcmp#t>EP2&?I)@#1NbF@gBhr0S}ebnmzmT`4jQ& zP>NwQS?@Ng+cYzTyy0a2=1T9vWj!#or|?62d&|W$pq=#&(*mkNu^zJJo#jS+1NUEp zP%T=_)YLg@k3)#+>9>+cb8J+~6bqhSYgq~jB*;rhhj{%gbo8CYVD(FkAO4&Gs_^|K z(1aBa^621~W)0FC@F|T>nIuzTk1eNMa$#a=+ ztHW<4l#|nR>MS$Z#@8J!T*6B`7l*2*{;R!dyd=ivNy?b?D8E;w4!~@#=Ks0wmgK>B z*Vv{B>ZeTI%bO!E9bRop!>h(rg`ubKcC?)@aX?A^E0J3_h_YIE|9$|D?q$nH{j)irym zZJ5t{5lyh%w_Yh~WWN@pT*RUU#T5?dDDD?U&DVVioiykyEW zN{yJS49ovhAaC&Fm5_-(TUxyWL2=Yk+>H$2WU=&9FLqGYS=vaCl%(Gt!&VWdpiGr| z0}bmu9Y|flQs|)zxx<4`j=uFTevBY4L=Qg)xPucMIR5zhM1k?=)f%m;9j}{AJD)ow zkqtd)Ri7Q@i!6E8FvCGfcb&QqHp;hLD5>`rr0y2yr|mw5psL(aV{WiL-M6zzGP0-L zQln6CWlZkyG)K?sAZr8DkdN?e)l`6cLoG$`_mWtfJ^rlut*E>nUs86tA6tDx8(;Ub zp7WhwOd;FjG~e_nowpLZ_BkUy_ck?gsn>`Dl^iq>)r`B-S3z6TPgnHs^`k%plB@D+)sxVgI|xv z|8%X;G8Wz`KkJ&}2;Jvl479so@xNbb)&9=9C>`NfEH!o~p^bbEZRvID9lCqXA36D2 z_V87|v-kIZJo|;&+rR&X54AGgRGwbvaXHz)30D9?m5M~KrnQ`r+KZxJeA57#i@J|8 zEj)kbvx~+`GcxN!PZ8^dozfQ*{P;`@3xQs*v9M>Aq4hWCx{aK-BfX0_j<3HSuueZn zuJ@Z7e?v&1;7Q1!{_$c+V-XR&Ejyswn?@22^|&-vCW^$nw246rAj^1iRnV`3E;K?d7+m3nsS~@7vyN z^yYyopSf|&f2>vOy-DmUB1x{g zftm9tfv(xM2tIvSY~Ak<{e!DOvnNql^y^ul_kLyljQOZAdohIEqGAaY2?39^%IGOK z{2<%ukt%s7z`Q(Gf&_9r{rk9_o-v?hmd8qgOjN5Hy z$Ys5jTk)@kbZxT>^lI>dm!?B-w89+4{24e*2$B+jqCk}Gq}Y*@SnY2wfe%)Z{1V?m zBkLav#jq#Qz_nnfRSSTg6QeWmONa;CxPG1>8u~y%TNzUZ@Q{kcp4(07ruzSqS$KuN z7V#MJ#0;oDQspWiSx}w9M#Yp!? 
zP~VYr$Pj@%8;afh$Ofjpb1qI;F=f4ZyNjAIXOkw1&5`p4Ob20E`=F=xH_rp0>J#9k z$i*e_#~_*5-ZmS=?ZSrG7Tc@BiSJ-D?U6r-JATi4RNOXau)TjJua=#X!@aC+$@Y*O zzvcox=rsTzwBG*aX3orBgF28SS#k=bc|@5I=XKdyDeXPs@}>-N$rqWEPw#P^o5OwX zWq+$jJ6Yx-z3(&dzh~a>7A|JL`cIqOZ#drfD?ktN@8d>dg8_gQKx#w&ZEgE)hX67nbu#~- zi6JDA_`g^cK&7B^XsKm6jh5*WpP~MZZ-oKIht@r|8(+_ei1j~v224uZ` z8Y|u0bzRnCeAPd`fGWA*7d1!cUVXk$bOjASW3Tzm49(kWewIAW1y1@>58S;zT-p)< zI`t2pfzDqX>@qj(`?y~li68?#06qwaAdtG^N!63_{|V)l|NayiKZ9qIb>{CPBn0}T zW;AtIvLAoXm<6gzeN~LX(aJ%|e4R9KoiJ7Inn>WRDRA~lIb6Yb>MJdGc>pfx*f=on zEBVg2)3G7NSwKTSGc8;v8H7kD9ZwDmvT|3KUp#lV_ zQ;di-QI0+-fzzJTl7Ndo;sy-v|JN=dInXU+r{3M-!Zh*B<%TaabICqS zZ^PBGgYAxN1mxU`#~a2$ru93M9&C~YKc=?l@Ua6+z3qIRTKhW6;q+XcmW_|(@^%=G zI#-kNd7D5!JNwnUG4J*>DYQ)(?NlqUCAv6-7*p4SiVvP0TCyhRjg@+-EyRYLn8Kmap(c+k7?IMAF!Agp z`*{Wp)_W?@M||Jj@%;1+FlWGv$?QJ-4cjLQ!zW%9ZY$I9G9uf;V$&my_@=Sg3vJ(r zqW6tSKxLalE-_+4W@*BJmY0Hy2jMxf_~Xex6!;3li~JbD#J+j97$|Qz{8F4*sv{TE z)hrwJ%L(^2i$Mpbb?mlkDja4eU~a>n`8>vB)MJ0EpF zXrgO&Gzz4)@thB!b0)LV_5IF|^hV64E}q9>C0h-p+crb*hhOq4ie_|`7P2@4ns?wZ zV{;0zQ!Iz-hxq)wmpQ&e^y3KE;cq7P?($QP~DvD#Cdfv=S ziyjXgG=oTt36IcSMTW*20S{N>Q-n2^y4R=lzmM@UP#@tj%Ra?qr^|NwtxWlobr9vn z_ESugK$yk@1_?fM2_ z!KgPiQ1pE6Ovum6cXgtUP3}ArbJ`L@b}x(LHUX5!SQ!&Z7-AgiB6#|)|2`;&Ld+DE zMuub71?f*?qMV*?%#w3WD{g?g<*n=Am!rOp;&Jb7d<@_A(*&Fjn5g>u3`O|0w;Q;Z zuJjtRkD+r2M`_P9{otwJQ-f?ns! za542@$O~*X8U=Xg$;b2xE@WS;2N`K~Sb5rQ&0QYswd#$Tff)kr`~DMBug^Mn=vNQ!v>~&SHYpP=>BMldq-lRkMZaMCC z(nR8Z9lBL7GtV%~N3vQag2i>|jODdI9KMKe>jZZtFeb2>F`6I?g1_;duHsz!Z1!{+08xkj4k_Z6e+9V7fCgaEk41yoyg>X^>L?2TI_c$o^grTN;KZIA<+S z;CV*y1p`ewce9EUA}Fu|clqdQ)MrX+r+3}=Y`i#8vYA7A<&n<00_lHQxkB-;$xbLP zU(vS5_BiST%_a6p_ud{qO64!g8ZUX*hfHOwG>s8mlgd(fC6A{&_$QzC`+?o(p#2@W z_4>M+{BS?7c4{_Y_r=xgD9Iyy9`JQFIQll-nzyj|PH9hmMuDo^CV^%EwYpS=QfR$` zC*NB#86tx0%47vaq^fxGr3k)KbFwMobc73Dlc4}t+At?H~rf`DurI7_2H^*(P`y} z4g|=>{P4-w3P|Q~_NE0o*JTUK z#oh1ueJES@`)06vRd5Wost4~uQU>E%i0ZSvbvWHC`929-%~#gPa{a0wiR5crqLIL? 
zbWs;Y#{aggqWe)!H)I=baBsGn^sdS*ZJ3hrf=}?W_+*v>_iJc>TN99Qgo3whpsO{(UX*6vKKB}( z=@3Pyo_BhWFF;mW{U=Gb+VqG^GXMc&b2`D71Ji^{r9 zjS=4;$2+vw$q;peE885|FS(iLAn825O|&D|)TwAwsSas2GTttpCwPVS-bqH;U*+i& za6eHQSLvNlzd0q!ShohH4=3J%vkJCLhC+e%H?Wb}ZcPeaGAnb%LWq3NTU z3|H0C5-GEKJaew`Lq#;(d2!?fu&p0U_)NdG9_CmFy|TX?MQTG#oR^nNnYN3G%8$mRqF+e8?_qC{X)K$O&L@V~ zSPfAQRUU3M2}gJ@HNFj+J)SJ>8w&EQ5Iq~g2)AMw*9^Nhw4568H+a|@KUOTaHJ$Bu z$;R@PTqipvBnJe*m_uM7!VOcezCgf`gQ;}S1g+LWY2A4-TQ}ue%1`}96Yr?vC>f4_ zJXUb74(1G68PW8tpA~_ z@ZV0M`&Wy5CtChxhjzMfg`#TKtAgJ1qK*#Fm~XbYrHMj3pG^K&*fvE<_0H+2(4Yw^ z2hrsd9d17R*X8l%w7^U~mYw&Pc~V1bdD#k_6eMG%wn;bE|F?v;|7u2tJQp}zHD|s~ z@UcYf;8dN*d|CW*ua_y|Em?-S$MTOK zl-d>z3qs>95sz!Gy*V{Y{^|K)`lUivSO0MQ+HGzJN^BvnO~&O1obp2 zVmJH|c3!5jzSQdD$?G!w`8LhcffM;yDN_*8Yjv1h059`hT)NuzoUHMq$=dRFjbYT? zo!w6Ws=ob9eDy@w1H}Q#6tQ2ixqFsF43@Vf5__yMJLJ)^xMPtZ*_9-Q@CRu<{=YK$ zA~8vu*=mp|$SiNIy)7l(!eS_Kxp89;E2Pq52nG6$O4Q_c0KbsW3_qFo-fd#6b#Y~x zDnNEUD?MRcCQm7&O}CumCHH9Up`-kv_EZ@OGs$9L2#UvjX+&lytI2+xraf7B+QcK) z|B8LmU1>gYAhVy1&>pm@`6g0P-%7Y8*XF!k-rf!Cr~O0Jw_3_I%LW?q)9qxuQgut2 z|LMz8l*7Y-ORsfOg^(^-Mbszs)>);KygrkBA3MNYC1tYkje!tHpiJspZ}Ka*39A0^ z{y3G4(PmWF;u|~H=nxKd+J|?|(|7f(jN@0q8hz)J$5tJ6;J{gK!6${11V>gK~>uN7IS<9@rNzZ3Su3^g!UQay0jkNr?vq0=;fJ zAer^CM9MDM^7fKx@1h@BgJT0Mfly+Hj*NW84_-^UI3o`IY5e&Vz&@-{v_67|ka=`P znSIMLm9r8$q%-mqzTV2FFpR-p3H0}>_oQ=?%9byGPm7`4zGTlnxq7?rIrHSsJ@slI z*1~UnyCAw%US*kO-%7vq-A=z37c5rW3Ici?L|hEdn2y1TbIA3_WAxdYpX-nU`_e6Ly~hu zmRFh{EjGW-whFHRe{!W1WC45ufrksiY*57K?ltc9sCs-|xRd>ei`9x@h}`fy9iNn_ zArTb55_e_&#S|T}TL%AHUUF!4)}00DNIArIM!sZ);gKBVGrMF!@$=sK#zkd$$Pwz< z01k4yZI0`-P+ zTsf|S){h#t z?aHj(2(IaKXHLULQXtEIWsLOa6^@WKP}Ut&Y*O}nxl2~)fPsFKl3~U$qQkzCjcFB* zSJ8ODE$cB8?Xy}l3mn5^7fojL0yOVXx7|EMynOlH)qZLv;U(n0qN6(8a$Fa zEpiM+n_f*wKY7W%s`Y*sEHt|X^>&P@y~|p>QBcw|XlwdRPq&NDrS5fib1mILu{xcu zdgmbcw3MDP_cDC(OK?pso5J?$QFevCgiM_KOcSTbqI zz6o#&Yr>$e%x~<|xu~S>4}$qlf>3~oDwD~ZNRvzcsnIcdqB||hrZ#^` zJ?JOkf*Q%N_1X+w#y`2JrUB<%7y5x<|9p3?d=>ha&a_+5Mlv8`u+wPb3f*VDuc(T> z)z!xNRYz0nue?`Uon40kn 
zmt@!5ndAQ>h@wyDJ?K`vxWyyQ5+9#4ovXo>BHR~Z-_IqufsW9^A04G_k{&J;BN;+gQ0acx@q~dUiAbiLQ;G3Ro|{J6HnT^CXeov9|fNYM)(F zlnd`WzYvfdAk-}CDZa23RDDF+Dal?*U{QBt-MHV*V zITCZ*frL~yxg69qUnYQvx7N<&yO`=0%^=ZEAsa!N8!@s?Hp7y9yU4sazlQNFto*`r z#I(fAWufMNUo$@>Nfv2BQegI!aa*%gt8<;Qo1mYW@n)(3eF zNZR0fo~*bLubCx+f}MGteadwtCWE?oOpxkz#@a)2<{ zV8uN%AVEIlGy43}H9p0sy~tBq3m5lfx@3rJtbJ`2ib(Vnmy;>!K0Q##d5W>k7jv5u zI~!HTfu-|IxZqtXGD2G^ywf%U9{TL6+5ccI%{Vr4g~z#pKdQErd%a)^BDYbkvgZ9y z1)QGkR%?VDabJE(6Meetq-Y7{GiqMTr=cW^w0MJQVn>5g8~u{`?3+Yrv&It@&GXqC zpl$)~ESLa4i4>|#Ucsjud`fW90qe+ZeraN`sw^hZ1PUn?S#1)@wc8e@MuMV*JGUH| zfV&8j*-BjCQ(S5X)x($EsrImAfu53IOu>%F{-ttQ*ydB48FkVZB9JZC{?hT=zmx&5 z$Uz+%ImaX^q%L($^f_b~O4FOuh-_EyQ-LPmFV>?| zF_=&rh1|V`f;*upQX$L=0|9O}SRK?iPO;eLr6A^;F>Rarf3+A6Ww^ltnMr@q7rb}U z>NKfdt#2$|&#ZvBc40z8iKGHNeH*)iYSDY(3k;3`e21#V*d}mIlJ)fykz=Idex4ww z)sa&brxTaXohDtG^az?bdMf++_e=FkR6sF1yO^}7zOWmuiH}J__pwHb1V)?1bZj0! z1{A}iYyRJR`PtY+rbi~)n_rJDFF8YnEesZ46g`M}Rqt*l@Ht;m75)!3#F-N;^lBol zJDA)PQNs(C3cZVOjH03ua6pQ z`AFBjA+%}3S+i$RpgM2J|!$F zsoEy@SPH$Ma*ByEi!n)6(R^CN&^&v^&(R@*gH+D8=y}~uwOg;|)>u&q80UTgE(X*%8ZaB;_pY63iMF`wa00e3zd1mWj~DY{|dA9a9lMA&o6_ibOSYiqb#4L1>egR|naX~nw|W**d|_5Rh)EZwvrF&(^rXL=f`mm;803}e z6b*I&(6xS#sZx@}>$xs*=AKP!-EQ)Cgr0V0i~ozd_Y7+?>-tAWnSpVnjKd&=76k#N z6GS?Uq97390i-t(0wMy^rA9?TrAY~dt^z@%X=qZTMj?eFU1|hE2`vx;Apw&AtuxR2 zKIh9h=l!4ST<1C;_`;n#*?aB1`dYtr@u_@O0+Q4DM^h>E1nu3^Yq#&D{?}boVSOO7 z6dYG!fz?u;E}S_&=2dfaNEv)2`7MzkS0v`@0?_hDvR!ce*SHKZt_;{i)w2|7T{odi zs;6EmJx}nt z*?Dz|>%Z?*Xu`h4*h#O%8OGjpD|O`hqwB60^(8_#btguGO&!YaVOA`ja)n+0e$T(h zy`(a}feg9jHB^7%`h&aV&KuKhzK#h?WbJc)?h6u%M<%*X{sG z7@kgtenPINafZhzY1Mdaw9$R|`^)R9ZM#n|{>LYT|1-3oI-FSP+h0th+gjzN{^v zG1>3t$+_P)b(16_96ui!KjDW&S(PYL;hcTgE%W4Ytsl$suEY@6?gMpZPY|#a`(%Jet0=nIfQKyj8dKy~=NJ{2rY78RU!GpJ8-fPMxs z-P6AXWz1VqJNKulIi2KVq!c{I_|f;TO65K{7W>=5Z=Jr!0Gwph#PJ#XG6+<3ulQ(e zi%jPUK@aIPPRtcf^YtIN3&uaOe>V6f{ud835JTU5L@MjwpwBl^|6BOa7qOpyz(Xfk z%iKi%4G*pQ8ZcRYU`q2EBmSR$=?%W2IL5Tx{#t$d?T(#MG+wsg`=%dx6$CV$>kUV4 
z(;nxk`x4kZ67tmOv9_MWj4rORm2XD;3V_e?_&l_tB))a+-F#ZeVxBKSEU4Yu6!rtn z+VQ?Nf2hP_d&%LNFJgG@{u4%AN9U=6*e%F7SwS7=DH~QAmJzf^L zvaR4TW%(ZQ_WDoVcYZu>`up)k?C=v)f8?C}57=#M@=dDvRp4JQ{yq|Er!T4+YX2Z{ zpE)jyl1=;@i#x>|c<~c}#w~isM>q7YR+=JF&Vj*VA4Sf`M(u zDE=0Q;3vrJDTKKIdE=FS6a4MqQR*#fvA54ia@@qT@D1R znI&4}OIhjdSGO~}#L#y?=sEbiwS@3|MlT#+_$=&^`GL`ZOLa&^h}DWKM9v&M)7xDR zg+$IMI=uAS?>-CkSA5@TO4_f;BJJZR38nL0INb4gY3cOCTByXkdcBsV2wF7B9Rg2R zMtf5Q2$@F;@N;pRSy%Q0lI?kw@DvRycHYnl*l2l zcRJ_b3X8Y%5-a81KycE0eD|I4xMdMa4Cn6newrHOTUSO1j!&8_tjV9VnwXYVQ4;4q zTI=fW_ips9adDha>%n~ciIA_wrLb>ulDX{3PMO0PmF)Xzan1Qkr14p8ZSrveymq$$ z!rG{U@YK#+J{**C`=8se0#YbEeRpNH60+8e$vk$5Ia?S1Hc62~6?hsOcRZtW`pWK% zjrVndj3CCJ;{XKKq`}uMqwW{VoHw;nhQYA(DX24?W{`I{&maz$Mr6MbE9*xLtg9A9Fwb^?8cq=ytWRm zgiw(@3)!piRo+2US`AO%Gkw{E5_$p63Z%Lt5BeI?!CM$oFz?RTSMAmUX*FKqHaN)U zE!=(TmBV}2J7Y~hpoLz9cJ9=;eSa-#-Zo@v1iN5}mBYPcHnKt{?V}FfRxIRp^JjDz zvXN^8{!iBGYjE+CPsBG*$mOk%;V**y5D5~7cl=14XSjmS|bl+JhKzm)Q4__F<7oR}crX!c~@^SYfsm|))&mey_@gq1rY4 zBcdv_c_df#7;9M&dZMlOga#P#+312$UPB9RxE394rWxf+$U$GHjyeq0$$a9ilpZAPpHF_EruSh57rq0mDLxW*0JKYWM}afhGpzW!SJel z$mq5;eP-Zz9NlfucUgasQQTHes)@wE7u(2)tA+oI+uab#J*8#In%+gyJE(lj5}uRz zu?;7AkG8-Z@z?>a7RrZXXzzgsgyKo?_98|Jt{sG2+oAXkHZ#n|yFo95f=KfF+F1>l z`FrbQgh&aVIPaR+{;jxf0dd2SgRcmE@2YUxbh@4lTtDWGqB8_DD4&p__s_}H%md=) z*#|iZ>Hb4Z4Q#!l5YKE^H}+UjLKn=hrUmIF&{LJsbF1L&vCeU~;_+4NAgx0M_fDFH zLzYJ5zBOTFJTZy4lhmfG)20|rIk^HW^2g!t@~U;`;-tFaf04a?T`fZo);iDtaI`nw z&*NEAcRFFjzn!@V1w{|#(U$FWafA#j`TwL?Ue%o5iJc;~uRT+UH9fDf|_EbTP&1-q=xnxLa7jIO| zV$UBzy?1Sk3)nAH0;4VmN?e!$y;lpFLX>)!BunOS*jR7vy>=e&gKW97aJT(hT()UC zUVw)`D=6-6C-2u8<}Kgs-C$YDB-lxg6nsq`nxqrIrgeaoBCTq8Sn6&b$=ez&K#YR^ zOO4$UQgC*Gt%WCe57w4s3hCZHjkZS}i3^)~`V~ z?XCXhu3WyXhA_lPmS(Onl`3ykdx+M%yF^`h((UZceh}#$;GF(N9vw>&&0~Mu4jID{ z7Yd2|8CzKW;eYbY+zoC3$!2gh368^Ex<{pWT1!aXuQQ3Z*f0^^ta&-` z38r=3LdeG%BXTe3pyap+J|VI4yU8&uwW#QNN}xC*vwhdo_iRL1*W6Iwsr9vco~*iJ z^jy80_r#c?vQa-XPyH_?^j#P$H)1oM{y>IE#``Hj4hBhtcY+-A8H1TT6enC%KSnT$ zNfyB2Rym20`RbpiqLRIr8!^v>l}Tpg`x{b5Zlomb`t#@toJb_p(KG+xV$>>I$$R;E 
zCnTP%1o7Q?X|`VYvP0#5n-~1f+98^w!R4JE9OwOQ_^{Aa=W8%>%~lP{Rl&Z*O4B#u zc7?YdP<9m-n8K)B_n-SUoa-B#V?W#;*y4Kz znY-8X9O@Z0B3J$~$TA|^cNYsF-Xu2KyGp6hY2{o^$+A+-Eo8)sazAJ60)(6^ho@8e zmsnrd)?1EyuSq0$_xQEpJ`NRH!keG8-<|M*Y_SXEWiq>oDi}p0l?khp?$or|c;C#8 z$T;wv$VNPPK?GA?E%LSnD&A0^0*Y-^bZn@$sSmy}LK_LePjjB>2kvLm{h5ipg@gT| zGHSIZKpAm`Lsa3}Hd1gyVbuI1$l)Vlc?~h?lPd0~IQz*w3oagoN?d@#Dt9Zip1{9I z#Dh+?V;h$2kCT%)>SlrOZbAKI0+@u)eZIBXpUoHXSyp-q!bO_UPELy{t)s@C;0eyy zNu*ruR_i-|_-af_n@Z+^^-kPYxSp~I^l*GPRWp3sX)9Hb_I?hIAlY{_wjsA6z9&z{ z7y0I6!gM1Gpm&4c7fBZE@SZVg!+$Fgm#XVWTAy>9d`~xo(f<{- z>|2s;=Nl*`&50_kHrqr}C~_$L-f=^P)xZ_0kZyaa$QCwfKK`|*D4?YZ2`Pe95 zC#3+7UB}kDc?Y|w65@uDfhm>k+KcLp5Fu{2X);UR%c~_;7mbgMy~Frji7~&FH8-A zpK&$ky8!@7d_!t!Ud=+`m!1d|14Mc-s|0VkS;N=eBJx~u$69B!zy>8-*sXG1Xx@b=Ym zdz$0h6$xhVF#()Flk>G7n+pb%lnLEg7OofmxH>fMx+>5l{6iczXuj!#?DcREzZ7Vf z4orf|Ylw-aO;njsgDE@il2DYQ+hlNhY;hD#4HQ4f51CC2RX8Ws?V|2Wd~CRW&cBu- z6;iKm+T7}O79<-f?&ul)CP^=f*#rsPt67g1X!Du1+$v`rT#3MNjDS@x^A1`HVd)68 z=Pim#Tq0Gbf{)dr==|o5P37A;HjgZRV_#3<5HsI1crRQ`1a-d+xVCuvaR#QFn2oo( zcB?F%dEA${6(fMaHYGYaI{?1j!b5>%an^v`!j?&KZML`S1^&O;a$p7-OX{HAU=&NG zjn)tnkEN4yZVz4IE(xnvbh1mTtEv4XD{1ERn8zJwm_PQeFH;^AjkiYAA-;Qo%Q7{Z z+J)a)tEGgolbKt)p;Su6IFD*nyZixy%o>cZ?n;!wi4lwd{AKODusX9pzueajs)<@7&Y+hJBh;@3(MqT}4wTCx>;q3l)JvgX={x(eY&1!9r8pn{&9LH1nU?3GSu4QO}&RU1}CuJ<;&*P zz<|0y+P9fD;@U6FIAUnbQ3Kzt%t(Tm`>D9KX$L|ICQPWMHmSkg;{tRQjGvx1UvwX{ zNF;@&RXY#83#2SKxaV-y?Y~^pFK<>io8(FK(;Q!YQAa#ICaxINz7<_=?Dd%-WWF%0 zo^+LzU{82YZOq}z!hEEYq}t#cQ7x-!#}K0Hso^w6Xd*!%MkU9q$XZ?u)kqr>cp|=4 zX+mDe^{C1AQoX?YkEGPN4o>83M54*lE|0?iZX!PQT$wgP#oaQAV!hq1`*~OgWyomI zi`|gB%pHEBrKv(4x*-zwh>}+0JyPQa2u@Pg6?0LlKi7HtI!7hnnb;J%#E=^|l=;RN z$@Tzj_M1#J9YcY8(@F*GZk>0Zidg!YvVAxPd;da;t8*zOQyt31tyMZa2C4{ZAT%3j zInw^qCqlTe4Hpkh))>Cqjk1jnaR zscM4(pKi)eA=N;_lK_<_^QQIuTr&nyUH}-WE$SdusD2^-E4nR(J*o+t$!sd9w4NV$ zB2E6h9sSgG#KM^)A%dEWCZ-T$a1VKS?94QM6y4ulPV4L}S<|SjQa5!8@<#3!zN;69 zAI_q~wg$~ojg#$L_ExM1mmV`A-9X3}*p7J7!r;(^=7(_mIor<{G9qcw%79${!zXOJY+q4DUE 
z*MsIHRj8cycG#y2LAX~uliattjY@9<0If!k@m9SWMCY3lq~D!T#@ky%R-DCPtmZNo zRB-SB0)&q~ej@7z<~mU~WNB^;Q{_w@+r$EcU z_Rx|T$|V|C&-rRujJ)vHrd7>r_Q92qHNF_aJyW;Bi#AyL3$$lGO^u?v;uW&7?rZ9g z?W~@W7v{t?ti`Xzf;NZj`&Da{K&bebmI(mvlpzQKfEy^PS86%gD3&o-~K#my+qsJ>xETLvEMqId&(u`vI_tOP}&b?Lonx ztkHp4wS%pd>d*r1`r*cA#mV;M6di^D!X{M=ILu6Ixc1f2!%*7{Vi9GVSFqFm`htd= zH`%V6KrlXxAec?*E}k4;Vk<37D(QQFi^g7yKQo3|D5b3za9oPvZ-Yq~AY5>y0-70L z@l+k!cP8M?$6@)nP7RaV)?3i>u=N|Wb1fkUW8K8Jldn9T736M@$YT!rt~yORb5vYl z=Og~?RsC?r@BMqng}66=JbK@M3x&$}Jlg8Kv$zHmD;rW(`|U)%uu`^Z$$?5n7Ngo2fI`HQCnGq z(@BAh2kKB82vn}E#N_t=OFcbpHfG54yvLsds&+9AsGIs4_EDm!FVzFz2`^aJX1N$J zc!eC-r>Kp6h{<$m^V0+!&VzH=1=+@FV;(VXsG@DZjN}WXuc~e&Blp97dm7FND-RHS zwC031CvAv)?=3hx0YwiAqN+3A)Oa+GofU|vGW^|4r)5GPj4dc8Hgsw zFvV1SrRf81F??AZt>3F$<0bh7ONV&e?@Me~dElSESMAuouMF7;O65$O3$jBidpKB> zlyKzqdsX~%$+TJs8ytiG$j#0)`&3oeA#d2pRHr56Y=k^lHxtv#^k+U>3uo{{d{%H~ zl&<7(e+N8=9bQI19FYfYnpdj^+(brjc7J0|gTHX1MLX@8lPV@USq<&%$yqB%{vPyh94&t`f1E@x~;(_I2tQx3G0Ht-}zHNVg%+*PK_{G9`L^WX4 z95uSi^GCf%o`&IH7Pg=%^e~yEhs>pf?m)M9ZlPc$=0m`x*7{exR%P#tkH|AbXt-#N6`rZ20}`gD`!%U6Yidh<`4#MhS7KOe}F8TT+KXa$h&x?@7K``;DUFsp4k=-qeOJ{GN)~+s35}$(tyoUhecY-KNTtSD~ zdONZ6ov+%5K`*4^4J~(BMe1ftTJe;WK~`?3OL>2!Fb?K02cxp!oPqpC1#&ewC3C(> z-Y`>l!AQykJ{KHCd6bG7K@PNAdt=Fl5!)6|;}L&N7#^lWpm&9TTU3nYU9zoN=LdR` zGGsTMzb$v{24DKRnKAXTx0UeFUzCA4k*7AVl*d{vy1c2i5#*+aCBJ*!)^p$z=?983 zDG&29UUeSZ(S@H&N*lk#UodDM{|zL2fS4*4Y$N(Ps{rG~uYZ@UvvHm&Y@FC#^+_~umRo~1BNxU|8c+i~1PdVM3a{C)+zcxJe zfQ`-=WH5-Z_h2AgHGT^z4j6Aydpb-?zX@?=M1s-x#mEbsg>8x}r#Bw02pp3)VHB!F z9>m1TSKyY6+@ki4Kv0Lp9C~!>PvuWJ9)HEFY7}0_)#^1Pur)eV^F6O_HJI3UDhxr@hx@3h>LswCs^p(S6K^cR_1PA$wf<W~LH7zM-_ zub$6-Z*LNAK(GxrU>l9FNAeE{q0Z^Uta9TCy*$L%3#t2B8%;ctnbUAXTW;5{w?t$! 
z;*$(c_v;M>r_cZH@0?5Q=T6BMY>G&rNBA$0Yewb9n|xjgdZfe12RqEZlWp2Y$JiT z2{x1t>eMcEeQ|NV(zoPUY0&V6}CmA!IS2^*HZ z#$3{Xd^_r!c^zp+RjtXoUs02#pYEg}cW^7L(zUxwhuNK1x`1>i7I`|$Ros>dG3Hpy zNPoV5D*uBjQw1*ll2?Gw)&2Wy=rH&i>0Wa2{xZq!znT)Kf%<;L@$v;C|H?(n;oERIp3r}Ju!YEAfeRV~hLnhM+hWJXC#o#aW)Dxv37 zd2Oo-TT^j(k37Zglf(0z2CuNho7@1%<8ta_mj4A&R1=UGN2+;^LZj}sSv_g5@|~1! z3G9F|7XcfN8t-y$L_}?N<053=fhyGUeyhSlK>>=0L@KY#*A`gJKU7BCQ*+nZzEZ$lb8Bh4u5uPMyXkVJID9eh>7MTj8g6qc%&>a!DgY3 z%qo(n6?r+6Q#EH^4KII1>QK%x@w+O7O7KbT-r%0@l1$-Z>Q|FTDP2#bL}WQv0u{i^ zL{cz}zMuE6u4wNH((0e=W;xTxtc?o(M}T;{U|;Q`P};fe=Jz?7RP33lJPrF8=#kNq zyviw0u+?a=2GewoYaeC zqQo1Au;o-wooNtC+K}F^x3yf4a5b(sIl(K4{}l+)nWwaoZJhRMm8HOUe1O1KP6y0B zT?sa)r0HN5emkXY-YhS!E9P=vs1GHbS|x%ac;MZpt}MEJ;~jDfJQ?)uF8k3EIlCYt zQymqSYg=e~%f)lW$=P!PuuQetiT7!Jb6p1<>ZavjKS#rL_gz>y9aCn<;wYVmO2tSX zra+5!!^(}jyE9?i3k{tDcvBZ$Y=cB+GQ}=WtCh{MJ6@{xPbl8|iGxSn70K~kFDdG} z(*>xTM4u`?XBtM|;7D(iqAG0AUUNHRq-UmvBvW#vW!;i?{m;P3bTWd@MW%JWwJF`2n>it1O2TwV z9deZV01jl&{5JHy-UXIEWKZnC*U0aq`1&tf<_Hwt9s+dc z;~P4O-b*haEQ|H$8ne^?B`Fe@To@4F zWq{5z^Pf1b>ZWOH`ppd==|jZ|3oq+M_XW5_cK)4i(6Fy@kuyr$dr}=gcCtJY2zK!& zR)+hVPy->a`^PS*%IrL@kc@If74MzM@X#${@4AVJ%C=+aW4$nT3;3B@^X+#>cR#A9 z;~RZjw?_Qbf-hzOz3;vT6Pp&o|f* z9~BU(d3UN}4>JSE9t`)h{tgktHbXis%nxdVjMtPwPM~j2U@&*(4J2z-y5+bFj8UIM zpt7~0qcs6NL0{tU5^O`r5h^)svyC!$|AqFFl!%JEb~3pGJ^vM_-0RVwtAl4~7epu> zB3%8#{lowkK^v(!v0o@dZgbeQL1N>f&1Xu#fkLSj6J2f4OWE#;G!a2Dhzh=uS$Jt` z&G3Y(d)xvnFYH4f4ObHJ%(RqRvkz`jaeZ@{9eijVvhhZlwv;myGDEH*YxbRDCLZ%} zWtOVBkq=xfGe#6N*Jf7$aWK|gbjqWxv7!*Z??mf+sRMVrwCdavx9m9X^~H9OdeJv$ zBTN<#WJ5YVS(DD=`{`0V2O$rDVj6$M(^TQQj@?%A{k(2+gTRwQv|UwU3rWkV1$7@2 z|G>KWx#z7LfQ7T3`k9!E|8^+wY@{IKTnF6^VgU}hO@$71N8?x4+X(OKs8Y6xd`caP z#lG*~Scqxbbb*`etM$kIwsWajqv|AE2($(Wb2IR0!*-WMH@dCUTb+sDEI{GS?eyji z$l=Qx*rxz^#%J@@`q!JIDNZzoKU`6mpLB^$=@~33bkYfe*dwnaqjN}}`GdA;MN@WZ z9UDa2#A?&{d-xVDVHG3e0iNR>4L89NH7>js^>?Uw1}ACSux9pB*7OJS%J;{#v1DGr z5yx){WK44yu$Ju)U3>P_SB~hg`-SwzW930CgWh0Pvn)=RD7uO{^bCXkQ^TY;hj8!! 
zBGMaDXjn7all8_c3GJC_^{xa=Rqo$tNabbjfM_fJb@sO!RTh;U&tU8JO7((~lihf# z5ObX8ay*cX%aiF>ZDf;VA-ijw98I5gyUGBb_Tk3`{khr>VF+!9NW_cWSgYcgrY6z} z`lhr1{+EPK8R1llY;F@h6R!1L8wBE2qteN!z;h;3Tb>}btq*1vx+pyz)7_3JVT!tE z%s6MlAfz9u3k!H7itugMUscjxLIFuJ{W??3u|7q0%myWG3TyS_z0qj`gUFBP!G6SLa~+C1gW=Fm8S?g1vfM7eW>zyK$y2HCaPI(roK^miGrKQCV(2Tifq(a&_lK z{)|fSw7YlJhLo<>5M9j|X)qfP1ixCd)GbQ5^t~y*tvJE902M4eyM}#gGLbl z4LzqfVR_YIvSx9Aj$WB@YBcug_c&!`QKSfjpFFz6x9y^~zx7^8i{PeIxLjQ|EihlL z6ca@adf+`DQ3MKb=7uEKu9>)X)`hH3A3vMJa3d9jwTl~Cl55J_K`ZiJ%FNd(60xa0 z#X|=34Yw(xAl(gvmc9H=$jz&oEuZsV_@i#iIOv3inc`KM>Eum)sQy9)wvF&evq+Jc zTU*^=BQy13Mq?!cD1pl@BVj$V^<~#&4}+%6*?0HBFxK&*NyZM>>L|V*fMoNXzFV;t zynkjqh9Cv{g0|s?hWJ!^BMVJ`vq$c>-zbV@FuceuOVgdYSE>=1!VAWh%Z+GnWPB^p z2=Mi6{^7EAua(EMv@d-n+2NBBO8S|$*fT)NJvbJ>g(#NdOxvA|qa@I1noT)5M@n*@ zB*hwa6-PWy(Odc%w6&iF8y@e*y%Xdikilz;LzqdI9C0#qV75O z$v}d>?MZtyi^uy@Dm-@t#QQO`T)}%e6@$oCjyx1^FUNh^fUJ%NLUsw$-ow*ygmBT8 zWAU6&#d!S^aak=AH!`*%H#R+DzELpcwsDK1q-n7sGp6MgR<3Re!!{sX9o3+if(dg= zi2=L5HuhB~)u7<=ay)8Hf8XvPAP@B5%o&Ht@_dZ7(a))*aq#D%%FmD-0tNT~qd>~I`@0(NWCbwUt z>5kH$Kzf}1UC7}uxIuhuyC~CBXLdWwoBdH6A#SYxPd?|>6}_FWrI6V4Ai`jP!CBf9 zcc~J<*uCWzgcSD(JxEtS6BqrMzOIa}P8&E`JFG zB9Ccd;_HNf>&OXvL02^Lb+~?h-PdV1!kHGQ_oaUr!RRo ztS;U3rEPGY>qQ)zcz@C$Bv8Nsj|8F>9Mo=hi2y}j*cD#g!;s{0 zj?<{DuGBwVvkmEaE?w6DEQoL~2_wyginrPofxn@5Z1d0({H?M>gE&c3+AHx2>twy=jk($G8pe78tl@`#_Rux z(mhWLos+AhHOarIVJ%t(3hx%SwN(Ai}e>*vaIOh5*o zc3@w1ZyVV80Fnh!;|<91mQ&D^=?C}PV}k%>7SuYdhp*(>tY*A%;?$cgy*u;Wu+0E) zPH;jSfgJ4SeT$d_832`A_wn+Dqn6{li#8aUo*~s9p805m>oTD@xMH$%LD32Q>cHZw zJty;?<9T?7YwXgMK~7>Eke#vG&5M4=TNqsp)U8pjAQ{vOjGxp*9=xH0r4T?Q1Ka5)-B#oG{|xfeVABDyBheWt!zvsQ<{C@ za&goK(*|7K$*gv3j^L_>m^qGi%a@$|RO67EZGY&26U(aOYcOJ-yE!h2_jXJFfLXlGeI>qP#5^ zbS}th<8+Ya#yJ%$%DMiw5-V*Z`eQG&x(N(>&E3-m8@8Q26Ecm}U&uV>wh-AIcNH+I zkI@dcho&_$2MrDl907`YfXCt%00ZRq$6vK0zEFU&z9gcgK+#Idz_N{UMkScl$U~8?RJt1LhM(F4Tbf#y36{&PH zy;4+cS^dZt#knn$k__o)xP5$+#kgvR#;Qfd0xb?*{8&hdV_+-t`PUyFY-P(Oo-kgz z&az3Klt0bM(62_Z{>JC0JBl7}+k0pbVvdRnXWwi%JgdvRu2UYX!Y)YpN_C-q6mH5C 
z@(4pM`S%^rgMDuHOAaJbl7z4ig;3IN%+exj?G-KvW3$(rW?hoK&xvi-+xjGk4qM2# z0qi4IAGe}gQ>9(WE4p^hnaz#6<9fmGn;tR$@jKfUn!gxf;8=K5JUX7SfN zRlk(>dhbWN%~jr%!E$`shh=|p1^e1(;r?Uz%wLjWxUVll2#pv-yk^mvxWq%V-F$lY z;BQ0sa;ggNx6T9?l-@>i&t*(ymP2uxG<{b91)NITx^Z9Eje>*?<3d4%bgjlU3j*GtvBJxi|19kFL z+|iCFmE6x9KdGD@wY`L&Y6*qy7HZRW*Ty0Kql$Ujp*~NVXQGK}Mn1E!Ip#KuRaJ+7 zwNK5`?Ps>|8oaB6j>nlq&BLSdAA`a(7GlCO8V`%kO^m(eoa{RvaC()0;!;i=H39u) zYfvB>|H;VkTx3ZnW-9!%2jGKaX)Ko@qD!D5P`P6KM3WS23v@`$4$jE$%m*}+#%rxY z7yHib2=;iWl6GS5AETMQ>r7O1{6^3MR;-vza#F6tXuHU2%?C61Fwge6&aXD2H@hDUn*I>=;{rI(J;@HFroFe+Jv%g5Qc{s9g=~41Y1f5GlDa!Xr0Ix!#nT6fe zvL{b3_U1NO4>nAeS zv4mjJ9d8w=-;SAz{#qPDv|`~?<>IYswO22<(AqtSI}Wb$ktD|R6j9DIXgB0z)thxc zh|R{T&2sdNIvm`i%EMxB^0e>GCCfb9eX4#>rQ;w`kM>m^5>3C^a+pIT1GzDGZnxkR z*I>Ls2nEB9li}$6OSy&a-9kr`a2jLoii{R75zRofu5h}UIdRyA^nr}envNhEcW)kS z_tzP6(~l=!?tPrT3uc~8q6HtSG%}z14@qE;_v^`J$r+q!o;d_FCp8=h;|}hM-EJ%g z?KKZLqLjU3rG1!bP8P^@I}LAJA<;^7so84MiNuB+Jfzpnite%>{?G#A1Jr#onL47$q*4^2@}%6PIID6K?Lm z?Uq!Kklrmv-B}qi?}F4t4ZGjXgPjaEev>vy zQ!EnG=SIpl93NuRhpd9GBQFhAJUnzL{c{Aa(IB=9f33$`_#`1JU(2UDC3MWEaFwK0 z=rz{;*2|c9wmH@Rap%16S9k~a1n^ha7W#S#)8K6H6!6r|?=@2Vx$LoJvy7mpqw{`_ z19mba^`fZE95=%+C7H#(i2nJ>zRUfEhl%Fc7A1e!hIsX}Mvxv|XdS_t-4(9A05F}x zznPAA0!UmAs|H-pfAFNMZZv8v0ZHtHht` zs(-P}Ha+^{;!!b?Gb5ErbzUWWgkKgb4=z4mKcU|9`}u*ck2cN)l?sf;VT=%Qj5?-i z_E+j3e73=Vm#yBa{c>Lnh4%;;yx*B0Op_BWx*ZmREwe4Yn^VmpyXVJFdE4fPO+8G* zi&xnoH^;1&Hw5)%hw{pWG70i{X!~}<>8mQUDMo%*KF6_4+#2pTExx;8Zv3&)msaVGzHYN=nE`hV^=X!HS=hT|1`{vu#9qD0 zQSh-$b#37+;6-mwHNW9_#+OswIXltSRQPA_M~9?uB6{bo_B~eGsyqxB3r&+Ow~d6A z8>Uy}UKeC;y=t!_7rf+4Iq@$g`_1szdpfnoj+(a(`D^MsI~oxqWdw6lr|D4^8`%F+ z`flI-?+u{dSt{;6R=Bd@;4xmH>+&Y@Hxc&xnuL~oIgNj@%U&Y0o{FArKXA_bb|8#3 zfA{1|iO9&cN7W@z{d)8#3-({YJV~9Cqw)7hCC;Af&H9e}uVn1Ob8jYoS~Q>h^0>4t zkB~gH>hYXEC1WL!EA)?q_@AymxHU5tFD{ndtuOeyXz5vzBHamH9?4joC)$+JKXg7^ zcB(PjqYiM+591wMo_?>HyziB8acX~%ka?dbo46y`m7kiLZI~CZ={w#k^^>}tn$5}9 z{4d5)m+VT*9R-6QYpaPl9-eCaJZ5xE!)Pxw-or@fQqjd2F%U4k*{b6GdBxZ3w;t{& 
zLWVv6Q~`ea1|pV|z^_zyzNP8ve1iWY|3rUb014RQ#Bpx`Jd?K<9kZTlyr4+` z_Pgcd3=suKzhe$kSG(s{|9VS53V^P2mkvA#J%t6ZBaC0Zk<|hFKRENbYw)5tud(m3 zzWclYtN!Q>-u&l7K#%No)%JhB`+HX{uZk8Ar23y`VeIk$*IohW(^stT(iyYqN!y6T zR1BZ=0|w0VYt!Y*e^BXrd+znS12fvc3sPfZV%;w=Kgyu-4uNNij<{9@mV9q2{`Vr2 zfg4wjkjXFaOKS6Z7XjeO{pf^F6kadlvL+QiGo2Rr*zc2MFQ2+3~{oqHpHU6K{_5c16pdu{ucc7*GnI+Wq z8nNzVd7NXGnqTaV+BjJ(nSEL z52GmYr)jsCBQL|iE`Pt+ktgVKB(=xdJo3DO+Sp9blbU;af0$FmK|c4tsM>KYb)P?0#uu5USo#;?+kvC( z{RV5vq?e`6@r`GvyGOmDq9sWGpv6!2e@Q+m4@={%|LZ35_5E$H{GSICbxm@tPr>fQ zD`x#QaW>s(z*-68oW0@Y50Oo^>S@#ry`gX_{gaK3NutxK^U#s^Of`$6e%K;u;@M{q zEyEnsQvuoCFP2~Bcqo}M-fDody5tro3bfw}Ju|s+u1FfI_`4|T*0%#Bu%g+%4A`Gn zL4{!AnP(RW^ZyF_LBRd5`6)jRwzu%>Z#e0<*?k$FCx7L$)wmN61y*!t2?zPDjJGcx z1`uT-_^i(B1&8FpxZ4(8Wm`{Lc6}^w|3c1iG+V-{9tYwTqFRP1U`6jjy+Oa&a4&U3 zj+Ld-wEmPBp;cX>7&OUx4eot%jZv3tI!e2$SRU$){N)n%8ebuJ2w2gSHag|#xrA-6 zyq^b+|GzmVSFX;esXIX;{;Cps3= z#*!AE2nz?^;oUx210AO3{~o&jKc%3?Pl?i|?b5+uFB?VC@K0*5jANR1BlJi7)VG=vGyeymb z7y;?U2B#G2D7@VwApUoLDLB#e!^$60gnKu=HJ}rnoSRC7!gIh};F<$|>HqmH^#AJ* z?9b9=o(uloYEZ||gYVgg-M6jNf4XmMpJlIpqxOHwsqeVYs6p?xm;d-em5B@P!(A67 zKl^=q>*DA>@SZqrAD?vM|Hh%|C_HXczOA;6w)szvcICrO_9swG8JRUiHV}9tsRDDCW!~3hTYj>Tq{Eoj>>J=?mLZ< z5>0N?#xoIRTn1;&?Aqt@Jo`E4{Cl3g{$J1fzO~->{eJInz3=ZmdCu?yWwPIsIA|)% zd;bn+%;42D)o`_s#mwJ_cg0_9!4*XbS{K_YtbKc%g}Zy|ccJP5iYgGp^B{Gx(ELaq5k$kuS(OMW6dOyw+P&^G?*<7KSwK z?^HA_3)zTu@G}rgq|8=kY^} zKGlLKEeJzOaCdjj4SX{)|K}Yh*-B*umTA34a2>2`=!mq7?6lwaqtqz?xwaBc^FG=q%_+>s6K^2~fabcL z0yIqn#3boaL*MP@V_u@JnQx6lG_603NJ#9A;dNT7PcoNw%4e|I&|H|C=V=8JDBH-tIO>)YSp4i3mD*S{CBEMgUmWXB zD`+)r(e^c-O*?5)Wp`Fz)j$fP?n48#rY&IN8`Cn2ETKB-qW6vN&d$3&H$#?nXWI)D;EI~han9}183O=? zw>p`w^5r#SM0j>ktVxCrzFcQ`_BuOC)xhsY71ZI<;zrX@HRxr|9%jzUfFN)nm+{uu zi^Yqe$nusR_>2;7b1wP6-_hpu?WC@gTA~`B%1J0|4l)BxWV(k#3irCK^O3lC_=os? 
zST@~G3p}=-j2#YY&HWY#9;g6EJu=upern7))*#QyU3l*L%6n6ylnotyq9ILi_p8{6 zAJs@+I%LLn?LI1(&S=E<%JZ+85);Q@iYx^~P3_xHvbV53#k{WD zv!~}1BIofk6|4A{|FF?l%K8VU!*1INfeXO~&|5!}o_0RTh$)#lrml~IMp8E758zWh zLh6l}6Nls;O9~xs+A&9&)8R|SxTYbB(}`3}uPVI%XUM||Rj_UI#QFJJp=Wid({#w^ zqgn=b^%+rR-^$QbNPQP%H~>!J3@x1t

v`fcd^LNrN#Iq6fMFV5kJj$$LK}r5hL> zi&ps>h##on;Mk?Zj9)}i;T0Wa=B)`!WqkCdcJ8womBY&Dlo1>>hra8Ema||ga-35R zx{@tk^lFrg?Eo~1w5=v@wm*q#*ts8i2H+T%`>LOZgvu0|k_$@8gy)yWg(!$9E}@U9 zHDkg1rr>~IE9q+3QvRAb3>heE4;6;JUrzQ5*Us2#ah zZuk}dJ-gy@oGBULcr{3(a-Q554kAZ`&>%p(fHskYOlYcWTT4a181%Ya>RPU$IXg!$ z7fy}u&*Jk{f}hzzqHI*u)ijK&C*;y%SjAio+73Ujs`UcnQ)*Q(tgrm?ip8;)UL}3$ zPt=95vd_P=VB!D~t9v}0BKv-%WX?uEjD9WynjvbWjfU@Jvzz9Y(mpykJXStO-vhnu zD%Ije1~4`4#vZq5i#PHq4@=_U8Njam4ytU^vn7RGEf5JB;*x+g(yc2C*r2>ZC5((t|4d>#8ealn*Q?tdYMgNC;se4k}+H z<$Qv}?UstAq&nm|cFV=x@zO^&MTgREWwrv7%Ud!xvc)kC5v+JE41PAT^T+;~OuASG%Q=-_97hA4N)YBQq`%3dQ@WVm+Kp9C;n~0hr{7mg+J4VKeN04$Lvl_TckZl zs}m)_tChcHn5*osx?(zC>p%WinjsY44;4lj3x6;L-xO)2ab8_kJKjGd43h4>DrNl)oZquS99BZIoPN;`u?M}CAT+Xt*$UW{UVv3NL0)AnN zMASY(Jk22?53@*iL8x5rjo=mNu#WsSZe?oMXjAyYryWA_BN;&(2L=hyErahYCi?O^ z^hazD^|Ny#2B)9f*qW@y@%y^GhAj#Pe>&9^92C0k#l@pXQy>Ar!_7@UJf+PHJ*_I- z`@>?`^2g@u**>``A{99~5rY``P0LOS8t5#ds-);dsNF!c%7EjtP<5AxeM#B/dev/null - @cp $(SRCDIR)/lib$(LIBNAME).a $(INSTALL_ROOT) 2>/dev/null - -tests: $(TESTS) $(FTESTS) - -clean: ; @$(RM) $(SRCDIR)/*.o $(SRCDIR)/*.s $(SRCDIR)/*.a $(TESTS) $(TESTS)/*.o $(FTESTS) $(FTESTS)/*.o - -cmds: ; @echo LIBP_MPICC = $(CCCMD); echo LINK = $(LINKCMD); - -deps: ; ./cdep.py *.c > makefile.cdep; - -odepinfo: deps objects; @./odep_info.py *.o - -$(TESTS): % : %.o | lib - $(LINKCMD) - -$(FTESTS): % : %.o | lib - $(FCCMD) $^ -o $@ -L$(SRCDIR) -l$(LIBNAME) - --include makefile.cdep - -%.o: %.c -ifneq (,${verbose}) - $(CCCMD) -c $< -o $@ -else - @printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n"; - @$(CCCMD) -c $< -o $@ -endif - -%.o: %.f -ifneq (,${verbose}) - $(FCCMD) -c $< -o $@ -else - @printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n"; - @$(FCCMD) -c $< -o $@ -endif - -%.s: %.c -ifneq (,${verbose}) - $(CCCMD) -S $< -o $@ -else - @printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n"; - @$(CCCMD) -S $< -o $@ -endif - -objects: $(OBJECTS) - -#poly_imp.h: gen_poly_imp.c -# $(RM) poly_imp.h; -# $(LIBP_MPICC) -lgmp -lm 
gen_poly_imp.c -o gen_poly_imp; -# ./gen_poly_imp > poly_imp.h; -# $(RM) gen_poly_imp diff --git a/3rdParty/gslib/README.md b/3rdParty/gslib/README.md deleted file mode 100644 index 8ded873a3..000000000 --- a/3rdParty/gslib/README.md +++ /dev/null @@ -1,23 +0,0 @@ -# GSLIB - -[![Build Status](https://travis-ci.org/gslib/gslib.svg?branch=master)](https://travis-ci.org/gslib/gslib) - -* Scalable Many-to-Many collectives -* Robust interpolation for hexahedral spectral element meshes - -# Build Instructions - -The build system relies on GNU Make with the `make` command. To compile gslib just run: - -``` -make CC=mpicc FC=mpif77 -make PREFIX= install -``` - -# Applications - -**\[1] [Nek5000](https://nek5000.mcs.anl.gov/)**: Nek5000 open-source, spectral element code. - -**\[2] [CEED](http://ceed.exascaleproject.org/)**: Co-design center for Efficient Exascale Discretizations. - -**\[3] [Nektar++](http://www.nektar.info)**: Nektar++ open-source spectral/hp element code. diff --git a/3rdParty/gslib/RELEASE.md b/3rdParty/gslib/RELEASE.md deleted file mode 100644 index fdaf06f49..000000000 --- a/3rdParty/gslib/RELEASE.md +++ /dev/null @@ -1,17 +0,0 @@ -# Release 1.0.3 - -## Major Features and Improvements -* Added non-blocking gather/scatter operations (CR not supported yet) -* Added Fortran wrapper for gs_unique -* Added gs_hf2c to convert Fortran into C handle - -## Backwards-Incompatible Changes -* Removed XXT and AMG solver from distribution - -## Bug Fixes and Other Changes - -[17](https://github.com/gslib/gslib/issues/17) - -## Thanks to our Contributors -This release contains contributions from: @stgeke -We are also grateful to all who filed issues or helped resolve them, asked and answered questions, and were part of inspiring discussions. 
diff --git a/3rdParty/gslib/cdep.py b/3rdParty/gslib/cdep.py deleted file mode 100755 index a0dd87a50..000000000 --- a/3rdParty/gslib/cdep.py +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/python - -import sys, os, re - -#mergestr = lambda x: reduce((lambda a,b: a+" "+b),x,"") - -pathjoin = lambda a,b: os.path.normpath(os.path.join(a,b)) -include_re = re.compile("\s*#\s*include\s*\"([^\"]*)\"") -incmatch = lambda x: ( include_re.match(line) for line in open(x) ) -incline = lambda x,m: pathjoin(os.path.split(x)[0],m.group(1)) -incl = lambda x: [ incline(x,m) for m in incmatch(x) if m!=None ] -includes = {} -def get_include(x): - if not includes.has_key(x): includes[x] = incl(x) - return includes[x] - -def closure(seq,f): - v = [], [x for x in seq], set(x for x in seq) - while len(v[1]): [(v[1].append(y),v[2].add(y)) for y in - f((lambda x: (v[0].append(x),x)[1])(v[1].pop())) if not y in v[2]] - return v[0] - -src_files = sys.argv[1:] -files = closure(src_files, get_include) -deps = dict((x,closure(includes[x],lambda y: includes[y])) for x in src_files) - -obj = lambda x: os.path.splitext(x)[0]+".o" - -for x in src_files: - print obj(x)+": "+x+reduce((lambda a,b: a+" "+b),deps[x],"") - -print -print "OBJECTS="+reduce((lambda a,b: a+" "+obj(b)),src_files,"") diff --git a/3rdParty/gslib/makefile.cdep b/3rdParty/gslib/makefile.cdep deleted file mode 100644 index e5c6ee766..000000000 --- a/3rdParty/gslib/makefile.cdep +++ /dev/null @@ -1,42 +0,0 @@ -amg.o: amg.c gs.h sarray_transfer.h crystal.h comm.h gs_defs.h sarray_sort.h sort.h mem.h fail.h types.h name.h c99.h -comm.o: comm.c comm.h gs_local.h gs_defs.h tensor.h types.h fail.h name.h -comm_test.o: comm_test.c comm.h gs_defs.h types.h fail.h name.h -crs_test.o: crs_test.c crs.h gs.h comm.h gs_defs.h mem.h types.h fail.h name.h c99.h -crystal.o: crystal.c mem.h comm.h types.h fail.h name.h c99.h -crystal_test.o: crystal_test.c crystal.h mem.h comm.h types.h fail.h name.h c99.h -fail.o: fail.c comm.h types.h fail.h name.h 
-fcrs.o: fcrs.c crs.h comm.h mem.h types.h fail.h name.h c99.h -fcrystal.o: fcrystal.c sarray_transfer.h sarray_sort.h sort.h crystal.h comm.h mem.h types.h fail.h name.h c99.h -findpts.o: findpts.c findpts_imp.h findpts_imp.h sarray_sort.h sort.h sarray_transfer.h crystal.h comm.h gs_defs.h findpts_local.h findpts_el.h obbox.h poly.h mem.h fail.h types.h name.h c99.h -findpts_el_2.o: findpts_el_2.c poly.h tensor.h mem.h types.h fail.h name.h c99.h -findpts_el_2_test2.o: findpts_el_2_test2.c rand_elt_test.h findpts_el.h obbox.h lob_bnd.h poly.h tensor.h mem.h fail.h name.h types.h c99.h -findpts_el_2_test.o: findpts_el_2_test.c findpts_el.h poly.h mem.h fail.h types.h name.h c99.h -findpts_el_3.o: findpts_el_3.c poly.h tensor.h mem.h types.h fail.h name.h c99.h -findpts_el_3_test2.o: findpts_el_3_test2.c rand_elt_test.h findpts_el.h obbox.h lob_bnd.h poly.h tensor.h mem.h fail.h name.h types.h c99.h -findpts_el_3_test.o: findpts_el_3_test.c findpts_el.h poly.h mem.h fail.h types.h name.h c99.h -findpts_local.o: findpts_local.c findpts_local_imp.h findpts_local_imp.h findpts_el.h sarray_sort.h sort.h poly.h obbox.h mem.h fail.h name.h types.h c99.h -findpts_local_test.o: findpts_local_test.c rand_elt_test.h findpts_local.h findpts_el.h obbox.h poly.h types.h mem.h fail.h name.h c99.h -findpts_test.o: findpts_test.c sarray_transfer.h crystal.h findpts.h rand_elt_test.h comm.h gs_defs.h poly.h mem.h types.h fail.h name.h c99.h -gen_poly_imp.o: gen_poly_imp.c -gs.o: gs.c sarray_transfer.h sarray_sort.h crystal.h sort.h mem.h comm.h gs_local.h gs_defs.h types.h fail.h name.h c99.h -gs_local.o: gs_local.c gs_defs.h types.h name.h c99.h -gs_test.o: gs_test.c gs.h gs_defs.h mem.h comm.h types.h fail.h name.h c99.h -gs_test_old.o: gs_test_old.c types.h name.h -gs_unique_test.o: gs_unique_test.c gs.h gs_defs.h mem.h comm.h types.h fail.h name.h c99.h -lob_bnd.o: lob_bnd.c poly.h mem.h fail.h types.h name.h c99.h -lob_bnd_test.o: lob_bnd_test.c lob_bnd.h poly.h tensor.h mem.h 
fail.h name.h types.h c99.h -obbox.o: obbox.c lob_bnd.h poly.h tensor.h mem.h types.h fail.h name.h c99.h -obbox_test.o: obbox_test.c rand_elt_test.h obbox.h lob_bnd.h poly.h mem.h fail.h name.h types.h c99.h -poly.o: poly.c poly_imp.h mem.h types.h fail.h name.h c99.h -poly_test.o: poly_test.c poly.h types.h name.h c99.h -rand_elt_test.o: rand_elt_test.c lob_bnd.h poly.h name.h types.h c99.h -sarray_sort.o: sarray_sort.c sort.h mem.h fail.h types.h name.h c99.h -sarray_sort_test.o: sarray_sort_test.c sarray_sort.h sort.h mem.h types.h fail.h name.h c99.h -sarray_transfer.o: sarray_transfer.c sort.h crystal.h mem.h comm.h types.h fail.h name.h c99.h -sarray_transfer_test.o: sarray_transfer_test.c sarray_transfer.h crystal.h sarray_sort.h sort.h mem.h comm.h types.h fail.h name.h c99.h -sort.o: sort.c sort_imp.h sort_imp.h sort_imp.h mem.h types.h fail.h name.h c99.h -sort_test2.o: sort_test2.c sort.h mem.h types.h fail.h name.h c99.h -sort_test.o: sort_test.c sort.h mem.h types.h fail.h name.h c99.h -tensor.o: tensor.c types.h name.h c99.h - -OBJECTS= comm.o comm_test.o crs_test.o crystal.o crystal_test.o fail.o fcrs.o fcrystal.o findpts.o findpts_el_2.o findpts_el_2_test2.o findpts_el_2_test.o findpts_el_3.o findpts_el_3_test2.o findpts_el_3_test.o findpts_local.o findpts_local_test.o findpts_test.o gen_poly_imp.o gs.o gs_local.o gs_test.o gs_test_old.o gs_unique_test.o lob_bnd.o lob_bnd_test.o obbox.o obbox_test.o poly.o poly_test2.o poly_test.o rand_elt_test.o sarray_sort.o sarray_sort_test.o sarray_transfer.o sarray_transfer_test.o sort.o sort_test2.o sort_test.o tensor.o diff --git a/3rdParty/gslib/odep_info.py b/3rdParty/gslib/odep_info.py deleted file mode 100755 index 620d0ec4a..000000000 --- a/3rdParty/gslib/odep_info.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/python - -import sys, os, re - -obj_files = sys.argv[1:] - -defined = dict((x,set([])) for x in obj_files) -undefined = dict((x,set([])) for x in obj_files) -nm_re = 
re.compile("[0-9a-fA-F]*\s*([BCDRTU])\s+([A-Za-z_][A-Za-z_0-9]*)\s*") -def nm_match(x): return ( nm_re.match(line) for line in os.popen('nm -g '+x) ) -def nm_line(x,m): - if m.group(1)=='U': undefined[x].add(m.group(2)) - else: defined[x].add(m.group(2)) -[ [ nm_line(x,m) for m in nm_match(x) if m!=None ] for x in obj_files ] - -def closure(seq,f): - v = [], [x for x in seq], set(x for x in seq) - while len(v[1]): [(v[1].append(y),v[2].add(y)) for y in - f((lambda x: (v[0].append(x),x)[1])(v[1].pop())) if not y in v[2]] - return v[0] - -needs={} -def get_needs(x): - if not needs.has_key(x): - needs[x]=[y for y in obj_files if len(defined[y]&undefined[x])] - return needs[x] -deps = dict((x,closure(get_needs(x),get_needs)) for x in obj_files) - -for x in deps: - print x,'depends on',reduce((lambda a,b: a+" "+b),deps[x],"") -print - -results = [ os.path.splitext(x)[0] for x in obj_files if 'main' in defined[x] ] -print "RESULTS="+reduce((lambda a,b: a+" "+b),results,"") -print - -def need_X(objs): - for x in objs: - if "XOpenDisplay" in undefined[x]: return True - return False - -for x in results: - objs = deps[x+'.o']; - if not (x+'.o') in objs: objs.append(x+'.o') - sobjs = reduce((lambda a,b: a+" "+b),objs,"") - if need_X(objs): - print x+":"+sobjs+" ; @echo LINK $@; $(LINKCMD) $^ -lX11 -o $@" - else: - print x+":"+sobjs+" ; @echo LINK $@; $(LINKCMD) $^ -o $@" - diff --git a/3rdParty/gslib/src/c99.h b/3rdParty/gslib/src/c99.h deleted file mode 100644 index a5a44e3a6..000000000 --- a/3rdParty/gslib/src/c99.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef C99_H -#define C99_H - -#ifndef __STDC_VERSION__ -# define NO_C99 -#elif __STDC_VERSION__ < 199901L -# define NO_C99 -#endif - -#ifdef NO_C99 -# define restrict -# define inline -# undef NO_C99 -#endif - -#endif diff --git a/3rdParty/gslib/src/comm.c b/3rdParty/gslib/src/comm.c deleted file mode 100644 index e537278f4..000000000 --- a/3rdParty/gslib/src/comm.c +++ /dev/null @@ -1,210 +0,0 @@ -#include /* for size_t */ 
-#include /* for exit */ -#include /* memcpy */ -#include /* for gs identities */ -#include /* for gs identities */ -#include "name.h" -#include "fail.h" -#include "types.h" -#include "tensor.h" -#include "gs_defs.h" -#include "gs_local.h" -#include "comm.h" - -uint comm_gbl_id=0, comm_gbl_np=1; - -GS_DEFINE_IDENTITIES() -GS_DEFINE_DOM_SIZES() - -static void scan_imp(void *scan, const struct comm *com, gs_dom dom, gs_op op, - const void *v, uint vn, void *buffer) -{ - comm_req req[2]; - size_t vsize = vn*gs_dom_size[dom]; - const uint id=com->id, np=com->np; - uint n = np, c=1, odd=0, base=0; - void *buf[2]; - void *red = (char*)scan+vsize; - buf[0]=buffer,buf[1]=(char*)buffer+vsize; - while(n>1) { - odd=(odd<<1)|(n&1); - c<<=1, n>>=1; - if(id>=base+n) c|=1, base+=n, n+=(odd&1); - } - gs_init_array(scan,vn,dom,op); - memcpy(red,v,vsize); - while(n>=1, n<<=1, n+=(odd&1); - odd>>=1; - if(base==id) { - comm_irecv(&req[0],com, buf[0],vsize, id+n/2,id+n/2); - comm_isend(&req[1],com, red ,vsize, id+n/2,id); - comm_wait(req,2); - gs_gather_array(red,buf[0],vn,dom,op); - } else { - comm_irecv(&req[0],com, scan,vsize, base,base); - comm_isend(&req[1],com, red ,vsize, base,id); - comm_wait(req,2); - break; - } - } - while(n>1) { - if(base==id) { - comm_send(com, scan ,2*vsize, id+n/2,id); - } else { - comm_recv(com, buffer,2*vsize, base,base); - gs_gather_array(scan,buf[0],vn,dom,op); - memcpy(red,buf[1],vsize); - } - odd=(odd<<1)|(n&1); - c<<=1, n>>=1; - if(id>=base+n) c|=1, base+=n, n+=(odd&1); - } -} - - -static void allreduce_imp(const struct comm *com, gs_dom dom, gs_op op, - void *v, uint vn, void *buf) -{ - size_t total_size = vn*gs_dom_size[dom]; - const uint id=com->id, np=com->np; - uint n = np, c=1, odd=0, base=0; - while(n>1) { - odd=(odd<<1)|(n&1); - c<<=1, n>>=1; - if(id>=base+n) c|=1, base+=n, n+=(odd&1); - } - while(n>=1, n<<=1, n+=(odd&1); - odd>>=1; - if(base==id) { - comm_recv(com, buf,total_size, id+n/2,id+n/2); - gs_gather_array(v,buf,vn, dom,op); - } 
else { - comm_send(com, v,total_size, base,id); - break; - } - } - while(n>1) { - if(base==id) - comm_send(com, v,total_size, id+n/2,id); - else - comm_recv(com, v,total_size, base,base); - odd=(odd<<1)|(n&1); - c<<=1, n>>=1; - if(id>=base+n) c|=1, base+=n, n+=(odd&1); - } -} - -void comm_scan(void *scan, const struct comm *com, gs_dom dom, gs_op op, - const void *v, uint vn, void *buffer) -{ - scan_imp(scan, com,dom,op, v,vn, buffer); -} - -void comm_allreduce(const struct comm *com, gs_dom dom, gs_op op, - void *v, uint vn, void *buf) -{ - if(vn==0) return; -#ifdef MPI - { - MPI_Datatype mpitype; - MPI_Op mpiop; - #define DOMAIN_SWITCH() do { \ - switch(dom) { case gs_double: mpitype=MPI_DOUBLE; break; \ - case gs_float: mpitype=MPI_FLOAT; break; \ - case gs_int: mpitype=MPI_INT; break; \ - case gs_long: mpitype=MPI_LONG; break; \ - WHEN_LONG_LONG(case gs_long_long: mpitype=MPI_LONG_LONG; break;) \ - default: goto comm_allreduce_byhand; \ - } \ - } while(0) - DOMAIN_SWITCH(); - #undef DOMAIN_SWITCH - switch(op) { case gs_add: mpiop=MPI_SUM; break; - case gs_mul: mpiop=MPI_PROD; break; - case gs_min: mpiop=MPI_MIN; break; - case gs_max: mpiop=MPI_MAX; break; - default: goto comm_allreduce_byhand; - } - MPI_Allreduce(v,buf,vn,mpitype,mpiop,com->c); - memcpy(v,buf,vn*gs_dom_size[dom]); - return; - } -#endif -#ifdef MPI -comm_allreduce_byhand: - allreduce_imp(com,dom,op, v,vn, buf); -#endif -} - -void comm_iallreduce(comm_req *req, const struct comm *com, gs_dom dom, gs_op op, - void *v, uint vn, void *buf) -{ - if(vn==0) return; -#ifdef MPI - { - MPI_Datatype mpitype; - MPI_Op mpiop; - #define DOMAIN_SWITCH() do { \ - switch(dom) { case gs_double: mpitype=MPI_DOUBLE; break; \ - case gs_float: mpitype=MPI_FLOAT; break; \ - case gs_int: mpitype=MPI_INT; break; \ - case gs_long: mpitype=MPI_LONG; break; \ - WHEN_LONG_LONG(case gs_long_long: mpitype=MPI_LONG_LONG; break;) \ - default: goto comm_allreduce_byhand; \ - } \ - } while(0) - DOMAIN_SWITCH(); - #undef 
DOMAIN_SWITCH - switch(op) { case gs_add: mpiop=MPI_SUM; break; - case gs_mul: mpiop=MPI_PROD; break; - case gs_min: mpiop=MPI_MIN; break; - case gs_max: mpiop=MPI_MAX; break; - default: goto comm_allreduce_byhand; - } - MPI_Iallreduce(v,buf,vn,mpitype,mpiop,com->c,req); - return; - } -#endif -#ifdef MPI -comm_allreduce_byhand: - allreduce_imp(com,dom,op, v,vn, buf); -#endif -} - -double comm_dot(const struct comm *comm, double *v, double *w, uint n) -{ - double s=tensor_dot(v,w,n),b; - comm_allreduce(comm,gs_double,gs_add, &s,1, &b); - return s; -} - -/* T comm_reduce__T(const struct comm *comm, gs_op op, const T *in, uint n) */ - -#define SWITCH_OP_CASE(T,OP) case gs_##OP: WITH_OP(T,OP); break; -#define SWITCH_OP(T,op) do switch(op) { \ - GS_FOR_EACH_OP(T,SWITCH_OP_CASE) case gs_op_n: break; } while(0) - -#define WITH_OP(T,OP) \ - do { T v = *in++; GS_DO_##OP(accum,v); } while(--n) - -#define DEFINE_REDUCE(T) \ -T PREFIXED_NAME(comm_reduce__##T)( \ - const struct comm *comm, gs_op op, const T *in, uint n) \ -{ \ - T accum = gs_identity_##T[op], buf; \ - if(n!=0) SWITCH_OP(T,op); \ - comm_allreduce(comm,gs_##T,op, &accum,1, &buf); \ - return accum; \ -} - -GS_FOR_EACH_DOMAIN(DEFINE_REDUCE) - -#undef DEFINE_REDUCE -#undef WITH_OP -#undef SWITCH_OP -#undef SWITCH_OP_CASE - diff --git a/3rdParty/gslib/src/comm.h b/3rdParty/gslib/src/comm.h deleted file mode 100644 index 1bd88264a..000000000 --- a/3rdParty/gslib/src/comm.h +++ /dev/null @@ -1,259 +0,0 @@ -#ifndef COMM_H -#define COMM_H - -/* requires: - for size_t - for exit - "fail.h", "types.h" - "gs_defs.h" for comm_allreduce, comm_scan, comm_reduce_T -*/ - -#if !defined(FAIL_H) || !defined(TYPES_H) -#warning "comm.h" requires "fail.h" and "types.h" -#endif - -/* - When the preprocessor macro MPI is defined, defines (very) thin wrappers - for the handful of used MPI routines. Alternatively, when MPI is not defined, - these wrappers become dummy routines suitable for a single process run. 
- No code outside of "comm.h" and "comm.c" makes use of MPI at all. - - Basic usage: - - struct comm c; - - comm_init(&c, MPI_COMM_WORLD); // initializes c using MPI_Comm_dup - - comm_free(&c); - - Very thin MPI wrappers: (see below for implementation) - - comm_send,_recv,_isend,_irecv,_time,_barrier - - Additionally, some reduction and scan routines are provided making use - of the definitions in "gs_defs.h" (provided this has been included first). - - Example comm_allreduce usage: - - double v[5], buf[5]; - comm_allreduce(&c, gs_double,gs_add, v,5,buf); - // Computes the vector sum of v across all procs, using - // buf as a scratch area. Delegates to MPI_Allreduce if possible. - - Example comm_scan usage: - - long in[5], out[2][5], buf[2][5]; - comm_scan(out, &c,gs_long,gs_add, in,5,buf); - // out[0] will be the vector sum of "in" across procs with ids - *strictly* less than this one (exclusive behavior), - and out[1] will be the vector sum across all procs, as would - be computed with comm_allreduce. - Note: differs from MPI_Scan which has inclusive behavior - - Example comm_reduce_double, etc. usage: - - T out, in[10]; - out = comm_reduce_T(&c, gs_max, in, 10); - // out will equal the largest element of "in", - across all processors - // T can be "double", "float", "int", "long", "slong", "sint", etc. 
- as defined in "gs_defs.h" - -*/ - -#ifdef MPI -#include -typedef MPI_Comm comm_ext; -typedef MPI_Request comm_req; -#else -typedef int comm_ext; -typedef int comm_req; -typedef int MPI_Fint; -#endif - -#define comm_allreduce PREFIXED_NAME(comm_allreduce ) -#define comm_iallreduce PREFIXED_NAME(comm_iallreduce) -#define comm_scan PREFIXED_NAME(comm_scan ) -#define comm_dot PREFIXED_NAME(comm_dot ) - -/* global id, np vars strictly for diagnostic messages (fail.c) */ -#ifndef comm_gbl_id -#define comm_gbl_id PREFIXED_NAME(comm_gbl_id) -#define comm_gbl_np PREFIXED_NAME(comm_gbl_np) -extern uint comm_gbl_id, comm_gbl_np; -#endif - -struct comm { - uint id, np; - comm_ext c; -}; - -static void comm_init(struct comm *c, comm_ext ce); -/* (macro) static void comm_init_check(struct comm *c, MPI_Fint ce, uint np); */ -/* (macro) static void comm_dup(struct comm *d, const struct comm *s); */ -static void comm_free(struct comm *c); -static double comm_time(void); -static void comm_barrier(const struct comm *c); -static void comm_recv(const struct comm *c, void *p, size_t n, - uint src, int tag); -static void comm_send(const struct comm *c, void *p, size_t n, - uint dst, int tag); -static void comm_irecv(comm_req *req, const struct comm *c, - void *p, size_t n, uint src, int tag); -static void comm_isend(comm_req *req, const struct comm *c, - void *p, size_t n, uint dst, int tag); -static void comm_wait(comm_req *req, int n); - -double comm_dot(const struct comm *comm, double *v, double *w, uint n); - -#ifdef GS_DEFS_H -void comm_allreduce(const struct comm *com, gs_dom dom, gs_op op, - void *v, uint vn, void *buf); -void comm_iallreduce(comm_req *req, const struct comm *com, gs_dom dom, gs_op op, - void *v, uint vn, void *buf); -void comm_scan(void *scan, const struct comm *com, gs_dom dom, gs_op op, - const void *v, uint vn, void *buffer); - -#define DEFINE_REDUCE(T) \ -T PREFIXED_NAME(comm_reduce__##T)( \ - const struct comm *comm, gs_op op, const T *in, uint n); \ 
-static T comm_reduce_##T(const struct comm *c, gs_op op, const T *v, uint vn) \ -{ return PREFIXED_NAME(comm_reduce__##T)(c,op,v,vn); } -GS_FOR_EACH_DOMAIN(DEFINE_REDUCE) -#undef DEFINE_REDUCE - -#define comm_reduce_sint \ - TYPE_LOCAL(comm_reduce_int,comm_reduce_long,comm_reduce_long_long) -#define comm_reduce_slong \ - TYPE_GLOBAL(comm_reduce_int,comm_reduce_long,comm_reduce_long_long) - -#endif - -/*---------------------------------------------------------------------------- - Code for static (inline) functions - ----------------------------------------------------------------------------*/ - -static void comm_init(struct comm *c, comm_ext ce) -{ -#ifdef MPI - int i; - MPI_Comm_dup(ce, &c->c); - MPI_Comm_rank(c->c,&i), comm_gbl_id=c->id=i; - MPI_Comm_size(c->c,&i), comm_gbl_np=c->np=i; -#else - c->id = 0, c->np = 1; -#endif -} - -static void comm_init_check_(struct comm *c, MPI_Fint ce, uint np, - const char *file, unsigned line) -{ -#ifdef MPI - comm_init(c,MPI_Comm_f2c(ce)); - if(c->np != np) - fail(1,file,line,"comm_init_check: passed P=%u, " - "but MPI_Comm_size gives P=%u", - (unsigned)np,(unsigned)c->np); -#else - comm_init(c,0); - if(np != 1) - fail(1,file,line,"comm_init_check: passed P=%u, " - "but not compiled with -DMPI",(unsigned)np); -#endif -} -#define comm_init_check(c,ce,np) comm_init_check_(c,ce,np,__FILE__,__LINE__) - - -static void comm_dup_(struct comm *d, const struct comm *s, - const char *file, unsigned line) -{ - d->id = s->id, d->np = s->np; -#ifdef MPI - MPI_Comm_dup(s->c,&d->c); -#else - if(s->np!=1) fail(1,file,line,"%s not compiled with -DMPI\n",file); -#endif -} -#define comm_dup(d,s) comm_dup_(d,s,__FILE__,__LINE__) - -static void comm_free(struct comm *c) -{ -#ifdef MPI - MPI_Comm_free(&c->c); -#endif -} - -static double comm_time(void) -{ -#ifdef MPI - return MPI_Wtime(); -#else - return 0; -#endif -} - -static void comm_barrier(const struct comm *c) -{ -#ifdef MPI - MPI_Barrier(c->c); -#endif -} - -static void comm_recv(const 
struct comm *c, void *p, size_t n, - uint src, int tag) -{ -#ifdef MPI -# ifndef MPI_STATUS_IGNORE - MPI_Status stat; - MPI_Recv(p,n,MPI_UNSIGNED_CHAR,src,tag,c->c,&stat); -# else - MPI_Recv(p,n,MPI_UNSIGNED_CHAR,src,tag,c->c,MPI_STATUS_IGNORE); -# endif -#endif -} - -static void comm_send(const struct comm *c, void *p, size_t n, - uint dst, int tag) -{ -#ifdef MPI - MPI_Send(p,n,MPI_UNSIGNED_CHAR,dst,tag,c->c); -#endif -} - -static void comm_irecv(comm_req *req, const struct comm *c, - void *p, size_t n, uint src, int tag) -{ -#ifdef MPI - MPI_Irecv(p,n,MPI_UNSIGNED_CHAR,src,tag,c->c,req); -#endif -} - -static void comm_isend(comm_req *req, const struct comm *c, - void *p, size_t n, uint dst, int tag) -{ -#ifdef MPI - MPI_Isend(p,n,MPI_UNSIGNED_CHAR,dst,tag,c->c,req); -#endif -} - -static void comm_wait(comm_req *req, int n) -{ -#ifdef MPI -# ifndef MPI_STATUSES_IGNORE - MPI_Status status[8]; - while(n>=8) MPI_Waitall(8,req,status), req+=8, n-=8; - if(n>0) MPI_Waitall(n,req,status); -# else - MPI_Waitall(n,req,MPI_STATUSES_IGNORE); -# endif -#endif -} - -static void comm_bcast(const struct comm *c, void *p, size_t n, uint root) -{ -#ifdef MPI - MPI_Bcast(p,n,MPI_UNSIGNED_CHAR,root,c->c); -#endif -} - -#endif diff --git a/3rdParty/gslib/src/crs.h b/3rdParty/gslib/src/crs.h deleted file mode 100644 index eeb60d33c..000000000 --- a/3rdParty/gslib/src/crs.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef CRS_H -#define CRS_H - -#if !defined(COMM_H) -#warning "crs.h" requires "comm.h" -#endif - -#define crs_xxt_setup PREFIXED_NAME(crs_xxt_setup) -#define crs_xxt_solve PREFIXED_NAME(crs_xxt_solve) -#define crs_xxt_stats PREFIXED_NAME(crs_xxt_stats) -#define crs_xxt_free PREFIXED_NAME(crs_xxt_free ) - -#define crs_amg_setup PREFIXED_NAME(crs_amg_setup) -#define crs_amg_solve PREFIXED_NAME(crs_amg_solve) -#define crs_amg_stats PREFIXED_NAME(crs_amg_stats) -#define crs_amg_free PREFIXED_NAME(crs_amg_free ) - -struct crs_data; - -struct crs_data *crs_xxt_setup( - uint n, const 
ulong *id, - uint nz, const uint *Ai, const uint *Aj, const double *A, - uint null_space, const struct comm *comm); -void crs_xxt_solve(double *x, struct crs_data *data, double *b); -void crs_xxt_stats(struct crs_data *data); -void crs_xxt_free(struct crs_data *data); - -struct crs_data *crs_amg_setup( - uint n, const ulong *id, - uint nz, const uint *Ai, const uint *Aj, const double *A, - uint null_space, const struct comm *comm); -void crs_amg_solve(double *x, struct crs_data *data, double *b); -void crs_amg_stats(struct crs_data *data); -void crs_amg_free(struct crs_data *data); - -#endif diff --git a/3rdParty/gslib/src/crystal.c b/3rdParty/gslib/src/crystal.c deleted file mode 100644 index a0e813508..000000000 --- a/3rdParty/gslib/src/crystal.c +++ /dev/null @@ -1,141 +0,0 @@ -/*------------------------------------------------------------------------------ - - Crystal Router - - Accomplishes all-to-all communication in log P msgs per proc - The routine is low-level; the format of the input/output is an - array of integers, consisting of a sequence of messages with format: - - target proc - source proc - m - integer - integer - ... - integer (m integers in total) - - Before crystal_router is called, the source of each message should be - set to this proc id; upon return from crystal_router, the target of each - message will be this proc id. - - Example Usage: - - struct crystal cr; - - crystal_init(&cr, &comm); // makes an internal copy of comm - - crystal.data.n = ... ; // total number of integers (not bytes!) - buffer_reserve(&cr.data, crystal.n * sizeof(uint)); - ... 
// fill cr.data.ptr with messages - crystal_router(&cr); - - crystal_free(&cr); - - ----------------------------------------------------------------------------*/ - -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "comm.h" -#include "mem.h" - -#define crystal_init PREFIXED_NAME(crystal_init ) -#define crystal_free PREFIXED_NAME(crystal_free ) -#define crystal_router PREFIXED_NAME(crystal_router) - -struct crystal { - struct comm comm; - buffer data, work; -}; - -void crystal_init(struct crystal *p, const struct comm *comm) -{ - comm_dup(&p->comm, comm); - buffer_init(&p->data,1000); - buffer_init(&p->work,1000); -} - -void crystal_free(struct crystal *p) -{ - comm_free(&p->comm); - buffer_free(&p->data); - buffer_free(&p->work); -} - -static void uintcpy(uint *dst, const uint *src, uint n) -{ - if(dst+n<=src) memcpy (dst,src,n*sizeof(uint)); - else if(dst!=src) memmove(dst,src,n*sizeof(uint)); -} - -static uint crystal_move(struct crystal *p, uint cutoff, int send_hi) -{ - uint len, *src, *end; - uint *keep = p->data.ptr, *send; - uint n = p->data.n; - send = buffer_reserve(&p->work,n*sizeof(uint)); - if(send_hi) { /* send hi, keep lo */ - for(src=keep,end=keep+n; src=cutoff) memcpy (send,src,len*sizeof(uint)), send+=len; - else uintcpy(keep,src,len), keep+=len; - } - } else { /* send lo, keep hi */ - for(src=keep,end=keep+n; srcdata.n = keep - (uint*)p->data.ptr; - return send - (uint*)p->work.ptr; -} - -static void crystal_exchange(struct crystal *p, uint send_n, uint targ, - int recvn, int tag) -{ - comm_req req[3]; - uint count[2] = {0,0}, sum, *recv[2]; - - if(recvn) - comm_irecv(&req[1],&p->comm, &count[0],sizeof(uint), targ ,tag); - if(recvn==2) - comm_irecv(&req[2],&p->comm, &count[1],sizeof(uint), p->comm.id-1,tag); - comm_isend(&req[0],&p->comm, &send_n,sizeof(uint), targ,tag); - comm_wait(req,recvn+1); - - sum = p->data.n + count[0] + count[1]; - buffer_reserve(&p->data,sum*sizeof(uint)); 
- recv[0] = (uint*)p->data.ptr + p->data.n, recv[1] = recv[0] + count[0]; - p->data.n = sum; - - if(recvn) comm_irecv(&req[1],&p->comm, - recv[0],count[0]*sizeof(uint), targ ,tag+1); - if(recvn==2) comm_irecv(&req[2],&p->comm, - recv[1],count[1]*sizeof(uint), p->comm.id-1,tag+1); - comm_isend(&req[0],&p->comm, p->work.ptr,send_n*sizeof(uint), targ,tag+1); - comm_wait(req,recvn+1); -} - -void crystal_router(struct crystal *p) -{ - uint bl=0, bh, nl; - uint id = p->comm.id, n=p->comm.np; - uint send_n, targ, tag = 0; - int send_hi, recvn; - while(n>1) { - nl = (n+1)/2, bh = bl+nl; - send_hi = id /* sprintf, vfprintf, stdout */ -#include /* va_list, va_start, ... */ -#include /* exit */ -#include /* memcpy, and str* functions in comm_fail */ -#include "name.h" -#include "fail.h" -#include "types.h" -#include "comm.h" - -#ifdef USE_USR_EXIT -#define userExitHandler FORTRAN_NAME(userexithandler,USEREXITHANDLER) -#define USEREXIT 1 -extern void userExitHandler(int status); -#else -#define USEREXIT 0 -void userExitHandler(int status) {}; -#endif - -void die(int status) -{ - if (USEREXIT) { - userExitHandler(status); - while(1); - } else { - exit(status); - while(1); - } -} - -void vdiagnostic(const char *prefix, const char *file, unsigned line, - const char *fmt, va_list ap) -{ - static char buf[2048]; int n,na,i=0; - sprintf(buf,"%s(proc %04d, %s:%d): ",prefix,(int)comm_gbl_id,file,line); - vsprintf(buf+strlen(buf),fmt,ap); - strcat(buf,"\n"); - n=strlen(buf); - while(n && (na=fwrite(buf+i,1,n,stdout))) n-=na, i+=na; - fflush(stdout); -} - -void diagnostic(const char *prefix, const char *file, unsigned line, - const char *fmt, ...) -{ - va_list ap; va_start(ap,fmt); - vdiagnostic(prefix,file,line,fmt,ap); - va_end(ap); -} - -void vfail(int status, const char *file, unsigned line, - const char *fmt, va_list ap) -{ - vdiagnostic("ERROR ",file,line,fmt,ap); - die(status); -} - -void fail(int status, const char *file, unsigned line, - const char *fmt, ...) 
-{ - va_list ap; va_start(ap,fmt); - vfail(status,file,line,fmt,ap); - va_end(ap); -} diff --git a/3rdParty/gslib/src/fail.h b/3rdParty/gslib/src/fail.h deleted file mode 100644 index 018511088..000000000 --- a/3rdParty/gslib/src/fail.h +++ /dev/null @@ -1,52 +0,0 @@ -#ifndef FAIL_H -#define FAIL_H - -#if !defined(NAME_H) -#warning "fail.h" requires "name.h" -#endif - -#define die PREFIXED_NAME( die ) -#define vdiagnostic PREFIXED_NAME(vdiagnostic) -#define diagnostic PREFIXED_NAME( diagnostic) -#define vfail PREFIXED_NAME(vfail ) -#define fail PREFIXED_NAME( fail ) - -#ifdef __GNUC__ -# define ATTRBD __attribute__ ((noreturn)) -# define ATTRB4V __attribute__ ((format(printf,4,0))) -# define ATTRB4 __attribute__ ((format(printf,4,5))) -# define ATTRB4DV __attribute__ ((noreturn,format(printf,4,0))) -# define ATTRB4D __attribute__ ((noreturn,format(printf,4,5))) -#else -# define ATTRBD -# define ATTRB4V -# define ATTRB4 -# define ATTRB4DV -# define ATTRB4D -#endif - -#define DEF_FUNS() \ - void die(int status) ATTRBD; \ - void diagnostic(const char *prefix, const char *file, unsigned line, \ - const char *fmt, ...) ATTRB4 ; \ - void fail (int status, const char *file, unsigned line, \ - const char *fmt, ...) 
ATTRB4D ; -#define VDEF_FUNS() \ - void vdiagnostic(const char *prefix, const char *file, unsigned line, \ - const char *fmt, va_list ap) ATTRB4V ; \ - void vfail (int status, const char *file, unsigned line, \ - const char *fmt, va_list ap) ATTRB4DV ; -DEF_FUNS() -#ifdef va_arg -VDEF_FUNS() -#endif - -#undef VDEF_FUNS -#undef DEF_FUNS -#undef ATTRB4D -#undef ATTRB4DV -#undef ATTRB4 -#undef ATTRB4V -#undef ATTRBD - -#endif diff --git a/3rdParty/gslib/src/fcrystal.c b/3rdParty/gslib/src/fcrystal.c deleted file mode 100644 index 3fe4c9ae9..000000000 --- a/3rdParty/gslib/src/fcrystal.c +++ /dev/null @@ -1,191 +0,0 @@ -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "mem.h" -#include "comm.h" -#include "crystal.h" -#include "sort.h" -#include "sarray_sort.h" -#include "sarray_transfer.h" - -/*-------------------------------------------------------------------------- - - FORTRAN Interface to crystal router - - integer h, np - MPI_Comm comm - call crystal_setup(h,comm,np) ! set h to handle to new instance - ! it is a runtime error if MPI_Comm_size gives a value different than np - call crystal_free(h) ! release instance - - integer*? ituple(m,max) ! integer type matching sint from "types.h" - call crystal_ituple_transfer(h, ituple,m,n,max, kp) - - moves each column ituple(:,i), 1 <= i <= n, - to proc ituple(kp,i) - - sets n to the number of columns received, - which may be larger than max (indicating loss of n-max columns) - - also sets ituple(kp,i) to the source proc of column ituple(:,i) - - call crystal_ituple_sort(h, ituple,m,n, key,nkey) - - locally sorts columns ituple(:,1...n) in ascending order, - ranked by ituple(key(1),i), - then ituple(key(2),i), - ... - then ituple(key(nkey),i) - - no communication; h used for scratch area - - linear time - - assumes nonnegative integers - - integer*? vi(mi,max) ! integer type matching sint from "types.h" - integer*? vl(ml,max) ! 
integer type matching slong from "types.h" - real vr(mr,max) - call crystal_tuple_transfer(h,n,max, vi,mi,vl,ml,vr,mr, kp) - - moves each column vi(:,i),vl(:,i),vr(:,i) 1 <= i <= n, - to proc vi(kp,i) - - sets n to the number of columns received, - which may be larger than max (indicating loss of n-max columns) - - also sets vi(kp,i) to the source proc of columns vi(:,i),vl(:,i),vr(:,i) - - call crystal_tuple_sort(h,n, vi,mi,vl,ml,vr,mr, key,nkey) - - locally sorts columns vi/vl/vr (:,1...n) in ascending order, - ranked by vi(key(1),i) [ or vl(key(1)-mi,i) if key(1)>mi ], - then vi(key(2),i) [ or vl(key(2)-mi,i) if key(2)>mi ], - ... - then vi(key(nkey),i) or vl(key(nkey)-mi,i) - - no communication; h used for scratch area - - linear time - - assumes nonnegative integers - - sorting on reals not yet implemented - - --------------------------------------------------------------------------*/ - -#undef crystal_free -#define ccrystal_free PREFIXED_NAME(crystal_free) - -#define fcrystal_setup \ - FORTRAN_NAME(crystal_setup ,CRYSTAL_SETUP ) -#define fcrystal_ituple_sort \ - FORTRAN_NAME(crystal_ituple_sort ,CRYSTAL_ITUPLE_SORT ) -#define fcrystal_tuple_sort \ - FORTRAN_NAME(crystal_tuple_sort ,CRYSTAL_TUPLE_SORT ) -#define fcrystal_ituple_transfer \ - FORTRAN_NAME(crystal_ituple_transfer,CRYSTAL_ITUPLE_TRANSFER) -#define fcrystal_tuple_transfer \ - FORTRAN_NAME(crystal_tuple_transfer ,CRYSTAL_TUPLE_TRANSFER ) -#define fcrystal_free \ - FORTRAN_NAME(crystal_free ,CRYSTAL_FREE ) - -static struct crystal **handle_array = 0; -static int handle_max = 0; -static int handle_n = 0; - -void fcrystal_setup(sint *handle, const MPI_Fint *comm, const sint *np) -{ - struct crystal *p; - if(handle_n==handle_max) - handle_max+=handle_max/2+1, - handle_array=trealloc(struct crystal*,handle_array,handle_max); - handle_array[handle_n]=p=tmalloc(struct crystal,1); - comm_init_check(&p->comm, *comm, *np); - buffer_init(&p->data,1000); - buffer_init(&p->work,1000); - *handle = handle_n++; -} 
- -#define CHECK_HANDLE(func) do \ - if(*handle<0 || *handle>=handle_n || !handle_array[*handle]) \ - fail(1,__FILE__,__LINE__,func ": invalid handle"); \ -while(0) - -void fcrystal_ituple_sort(const sint *handle, - sint A[], const sint *m, const sint *n, - const sint keys[], const sint *nkey) -{ - const size_t size = (*m)*sizeof(sint); - sint nk = *nkey; - buffer *buf; - CHECK_HANDLE("crystal_ituple_sort"); - buf = &handle_array[*handle]->data; - if(--nk>=0) { - sortp(buf,0, (uint*)&A[keys[nk]-1],*n,size); - while(--nk>=0) - sortp(buf,1, (uint*)&A[keys[nk]-1],*n,size); - sarray_permute_buf_(ALIGNOF(sint),size,A,*n, buf); - } -} - -void fcrystal_tuple_sort(const sint *const handle, const sint *const n, - sint Ai[], const sint *const mi, - slong Al[], const sint *const ml, - double Ad[], const sint *const md, - const sint keys[], const sint *const nkey) -{ - const size_t size_i = (*mi)*sizeof(sint), - size_l = (*ml)*sizeof(slong), - size_d = (*md)*sizeof(double); - int init=0; - sint nk = *nkey; - buffer *buf; - CHECK_HANDLE("crystal_tuple_sort"); - buf = &handle_array[*handle]->data; - if(nk<=0) return; - while(--nk>=0) { - sint k = keys[nk]-1; - if(k<0 || k>=*mi+*ml) - fail(1,__FILE__,__LINE__,"crystal_tuple_sort: invalid key"); - else if(k<*mi) sortp (buf,init, (uint *)&Ai[k], *n,size_i); - else sortp_long(buf,init, (ulong*)&Al[k-*mi],*n,size_l); - init=1; - } - if(*mi) sarray_permute_buf_(ALIGNOF(sint ),size_i,Ai,*n, buf); - if(*ml) sarray_permute_buf_(ALIGNOF(slong ),size_l,Al,*n, buf); - if(*md) sarray_permute_buf_(ALIGNOF(double),size_d,Ad,*n, buf); -} - -void fcrystal_ituple_transfer(const sint *handle, - sint A[], const sint *m, sint *n, - const sint *nmax, const sint *proc_key) -{ - struct array ar, *const ar_ptr = &ar; - const unsigned size=(*m)*sizeof(sint); - CHECK_HANDLE("crystal_ituple_transfer"); - ar.ptr=A, ar.n=*n, ar.max=*nmax; - *n = sarray_transfer_many(&ar_ptr,&size,1, 1,0,1,(*proc_key-1)*sizeof(sint), - (uint*)&A[*proc_key-1],size, 
handle_array[*handle]); -} - -void fcrystal_tuple_transfer( - const sint *const handle, sint *const n, const sint *const max, - sint Ai[], const sint *const mi, - slong Al[], const sint *const ml, - double Ad[], const sint *const md, - const sint *const proc_key) -{ - struct array ar_i, ar_l, ar_d, *ar[3]; - unsigned size[3]; - CHECK_HANDLE("crystal_tuple_transfer"); - size[0]=*mi*sizeof(sint); - size[1]=*ml*sizeof(slong); - size[2]=*md*sizeof(double); - ar[0]=&ar_i, ar[1]=&ar_l, ar[2]=&ar_d; - ar_i.ptr=Ai,ar_l.ptr=Al,ar_d.ptr=Ad; - ar_i.n=ar_l.n=ar_d.n = *n; - ar_i.max=ar_l.max=ar_d.max=*max; - *n = sarray_transfer_many(ar,size,3, 1,0,1,(*proc_key-1)*sizeof(sint), - (uint*)&Ai[*proc_key-1],size[0], handle_array[*handle]); -} - -void fcrystal_free(sint *handle) -{ - CHECK_HANDLE("crystal_free"); - ccrystal_free(handle_array[*handle]); - free(handle_array[*handle]); - handle_array[*handle] = 0; -} - - diff --git a/3rdParty/gslib/src/findpts.c b/3rdParty/gslib/src/findpts.c deleted file mode 100644 index 1ed472f36..000000000 --- a/3rdParty/gslib/src/findpts.c +++ /dev/null @@ -1,369 +0,0 @@ -#include -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "types.h" -#include "fail.h" -#include "mem.h" -#include "poly.h" -#include "obbox.h" -#include "findpts_el.h" -#include "findpts_local.h" -#include "gs_defs.h" -#include "comm.h" -#include "crystal.h" -#include "sarray_transfer.h" -#include "sort.h" -#include "sarray_sort.h" -/* -#define DIAGNOSTICS -*/ -#ifdef DIAGNOSTICS -#include -#endif - -#define CODE_INTERNAL 0 -#define CODE_BORDER 1 -#define CODE_NOT_FOUND 2 - -struct ulong_range { ulong min, max; }; -struct proc_index { uint proc, index; }; - -static slong lfloor(double x) { return floor(x); } -static slong lceil (double x) { return ceil (x); } - -static ulong hash_index_aux(double low, double fac, ulong n, double x) -{ - const slong i = lfloor((x-low)*fac); - return i<0 ? 0 : (n-1<(ulong)i ? 
n-1 : (ulong)i); -} - -static void set_bit(unsigned char *const p, const uint i) -{ - const uint byte = i/CHAR_BIT; - const unsigned bit = i%CHAR_BIT; - p[byte] |= 1u<>bit & 1u; -} - -static unsigned byte_bits(const unsigned char x) -{ - unsigned bit, sum=0; - for(bit=0;bit>bit & 1u; - return sum; -} - -static uint count_bits(unsigned char *p, uint n) -{ - uint sum=0; - for(;n;--n) sum+=byte_bits(*p++); - return sum; -} - -#define D 2 -#define WHEN_3D(a) -#include "findpts_imp.h" -#undef WHEN_3D -#undef D - -#define D 3 -#define WHEN_3D(a) a -#include "findpts_imp.h" -#undef WHEN_3D -#undef D - -/*-------------------------------------------------------------------------- - - FORTRAN Interface - - -------------------------------------------------------------------------- - call findpts_setup(h, comm,np, ndim, xm,ym,zm, nr,ns,nt,nel, - mr,ms,mt, bbox_tol, loc_hash_size, gbl_hash_size, - npt_max, newt_tol) - - (zm,nt,mt all ignored when ndim==2) - - h: (output) handle - comm,np: MPI communicator and # of procs (checked against MPI_Comm_size) - ndim: 2 or 3 - xm,ym,zm: element geometry (nodal x,y,z values) - nr,ns,nt,nel: element dimensions --- e.g., xm(nr,ns,nt,nel) - - mr,ms,mt: finer mesh size for bounding box computation; - must be larger than nr,ns,nt for correctness, - recommend at least 2*nr,2*ns,2*nt - bbox_tol: e.g., 0.01 - relative size to expand bounding boxes by; - prevents points from falling through "cracks", - and prevents "not found" failures for points just outside mesh - (returning instead the closest point inside the mesh) - - loc_hash_size: e.g., nr*ns*nt*nel - maximum number of integers to use for local geom hash table; - minimum is nel+2 for the trivial table with one cell - - gbl_hash_size: e.g., nr*ns*nt*nel - approx number of cells per proc for the distributed - global geometric hash table - NOTE: gbl_hash_size*np needs to fit in a "global" integer - (controlled by -DGLOBAL_LONG or -DGLOBAL_LONG_LONG; - see "types.h") - actual number of cells 
per proc will be greater by - ~ 3 gbl_hash_size^(2/3) / np^(1/3) - - npt_max: e.g., 256 - number of points to iterate on simultaneously - enables dominant complexity to be matrix-matrix products - (there is a sweet spot --- too high and the cache runs out) - the memory allocation term dependent on npt_max is - (12 + 2*(nr+ns+nt+nr*ns)) * npt_max doubles - - newt_tol: e.g., 1024*DBL_EPSILON - the iteration stops for a point when - the 1-norm of the step in (r,s,t) is smaller than newt_tol - or the objective (dist^2) increases while the predicted (model) - decrease is smaller than newt_tol * (the objective) - - -------------------------------------------------------------------------- - call findpts_free(h) - - -------------------------------------------------------------------------- - call findpts(h, code_base, code_stride, - proc_base, proc_stride, - el_base, el_stride, - r_base, r_stride, - dist2_base, dist2_stride, - x_base, x_stride, - y_base, y_stride, - z_base, z_stride, npt) - - (z_base, z_stride ignored when ndim==2) - - conceptually, locates npt points; - data for each point is: - ouput: - code: 0 - inside an element - 1 - closest point on a border - (perhaps exactly, or maybe just near --- check dist2) - 2 - not found (bbox_tol controls cut-off between code 1 and 2) - proc: remote processor on which the point was found - el: element on remote processor in which the point was found - r(ndim): parametric coordinates for point - dist2: distance squared from found to sought point (in xyz space) - input: - x, y, z: coordinates of sought point - - the *_base arguments point to the data for the first point, - each is advanced by the corresponding *_stride argument for the next point - this allows fairly arbitrary data layout, - but note the r,s,t coordinates for each point must be packed together - (consequently, r_stride must be at least ndim) - - - -------------------------------------------------------------------------- - call findpts_eval(h, out_base, 
out_stride, - code_base, code_stride, - proc_base, proc_stride, - el_base, el_stride, - r_base, r_stride, npt, - input_field) - - may be called immediately after findpts (or any other time) - to evaluate input_field at the given points --- - these specified by code,proc,el,r(ndim) and possibly remote - --- storing the interpolated values in out - [that is, at out_base(1+out_stride*(point_index-1)) ] - - for example, following a call to findpts, a call to findpts_eval with - input_field = xm, will ideally result in out = x(1) for each point, - or x(2) for ym, x(3) for zm - - - -------------------------------------------------------------------------- - call findpts_eval_local(h, - out_base, out_stride, - el_base, el_stride, - r_base, r_stride, npt, - input_field) - - just like findpts_eval, but does assumes all points are local, - and does no communication. will use matrix-matrix products if - points are grouped by element. - - --------------------------------------------------------------------------*/ - -#define ffindpts_setup FORTRAN_NAME(findpts_setup ,FINDPTS_SETUP ) -#define ffindpts_free FORTRAN_NAME(findpts_free ,FINDPTS_FREE ) -#define ffindpts FORTRAN_NAME(findpts ,FINDPTS ) -#define ffindpts_eval FORTRAN_NAME(findpts_eval ,FINDPTS_EVAL ) -#define ffindpts_eval_local FORTRAN_NAME(findpts_eval_local,FINDPTS_EVAL_LOCAL) - -struct handle { void *data; unsigned ndim; }; -static struct handle *handle_array = 0; -static int handle_max = 0; -static int handle_n = 0; - -void ffindpts_setup(sint *const handle, - const MPI_Fint *const comm, const sint *const np, - const sint *ndim, - const double *const xm, const double *const ym, const double *const zm, - const sint *const nr, const sint *const ns, const sint *const nt, - const sint *const nel, - const sint *const mr, const sint *const ms, const sint *const mt, - const double *const bbox_tol, - const sint *const loc_hash_size, const sint *const gbl_hash_size, - const sint *const npt_max, - const double *const 
newt_tol) -{ - struct handle *h; - if(handle_n==handle_max) - handle_max+=handle_max/2+1, - handle_array=trealloc(struct handle,handle_array,handle_max); - h = &handle_array[handle_n]; - h->ndim = *ndim; - if(h->ndim==2) { - struct findpts_data_2 *const fd = tmalloc(struct findpts_data_2,1); - const double *elx[2]; - uint n[2], m[2]; - elx[0]=xm,elx[1]=ym; - n[0]=*nr,n[1]=*ns; - m[0]=*mr,m[1]=*ms; - h->data = fd; - comm_init_check(&fd->cr.comm, *comm, *np); - buffer_init(&fd->cr.data,1000); - buffer_init(&fd->cr.work,1000); - setup_aux_2(fd, elx,n,*nel,m,*bbox_tol, - *loc_hash_size,*gbl_hash_size, *npt_max, *newt_tol); - } else if(h->ndim==3) { - struct findpts_data_3 *const fd = tmalloc(struct findpts_data_3,1); - const double *elx[3]; - uint n[3], m[3]; - elx[0]=xm,elx[1]=ym,elx[2]=zm; - n[0]=*nr,n[1]=*ns,n[2]=*nt; - m[0]=*mr,m[1]=*ms,m[2]=*mt; - h->data = fd; - comm_init_check(&fd->cr.comm, *comm, *np); - buffer_init(&fd->cr.data,1000); - buffer_init(&fd->cr.work,1000); - setup_aux_3(fd, elx,n,*nel,m,*bbox_tol, - *loc_hash_size,*gbl_hash_size, *npt_max, *newt_tol); - } else - fail(1,__FILE__,__LINE__, - "findpts_setup: ndim must be 2 or 3; given ndim=%u",(unsigned)h->ndim); - *handle = handle_n++; -} - -#define CHECK_HANDLE(func) \ - struct handle *h; \ - if(*handle<0 || *handle>=handle_n || !(h=&handle_array[*handle])->data) \ - fail(1,__FILE__,__LINE__,func ": invalid handle") - -void ffindpts_free(const sint *const handle) -{ - CHECK_HANDLE("findpts_free"); - if(h->ndim==2) - PREFIXED_NAME(findpts_free_2)(h->data); - else - PREFIXED_NAME(findpts_free_3)(h->data); - h->data = 0; -} - -void ffindpts(const sint *const handle, - sint *const code_base, const sint *const code_stride, - sint *const proc_base, const sint *const proc_stride, - sint *const el_base, const sint *const el_stride, - double *const r_base, const sint *const r_stride, - double *const dist2_base, const sint *const dist2_stride, - const double *const x_base, const sint *const x_stride, - const 
double *const y_base, const sint *const y_stride, - const double *const z_base, const sint *const z_stride, - const sint *const npt) -{ - CHECK_HANDLE("findpts"); - if(h->ndim==2) { - const double *xv_base[2]; - unsigned xv_stride[2]; - xv_base[0]=x_base, xv_base[1]=y_base; - xv_stride[0] = *x_stride*sizeof(double), - xv_stride[1] = *y_stride*sizeof(double); - PREFIXED_NAME(findpts_2)( - (uint*)code_base,(* code_stride)*sizeof(sint ), - (uint*)proc_base,(* proc_stride)*sizeof(sint ), - (uint*) el_base,(* el_stride)*sizeof(sint ), - r_base,(* r_stride)*sizeof(double), - dist2_base,(*dist2_stride)*sizeof(double), - xv_base, xv_stride, - *npt, h->data); - } else { - const double *xv_base[3]; - unsigned xv_stride[3]; - xv_base[0]=x_base, xv_base[1]=y_base, xv_base[2]=z_base; - xv_stride[0] = *x_stride*sizeof(double), - xv_stride[1] = *y_stride*sizeof(double), - xv_stride[2] = *z_stride*sizeof(double); - PREFIXED_NAME(findpts_3)( - (uint*)code_base,(* code_stride)*sizeof(sint ), - (uint*)proc_base,(* proc_stride)*sizeof(sint ), - (uint*) el_base,(* el_stride)*sizeof(sint ), - r_base,(* r_stride)*sizeof(double), - dist2_base,(*dist2_stride)*sizeof(double), - xv_base, xv_stride, - *npt, h->data); - } -} - -void ffindpts_eval(const sint *const handle, - double *const out_base, const sint *const out_stride, - const sint *const code_base, const sint *const code_stride, - const sint *const proc_base, const sint *const proc_stride, - const sint *const el_base, const sint *const el_stride, - const double *const r_base, const sint *const r_stride, - const sint *const npt, const double *const in) -{ - CHECK_HANDLE("findpts_eval"); - if(h->ndim==2) - PREFIXED_NAME(findpts_eval_2)( - out_base,(* out_stride)*sizeof(double), - (uint*)code_base,(*code_stride)*sizeof(sint ), - (uint*)proc_base,(*proc_stride)*sizeof(sint ), - (uint*) el_base,(* el_stride)*sizeof(sint ), - r_base,(* r_stride)*sizeof(double), - *npt, in, h->data); - else - PREFIXED_NAME(findpts_eval_3)( - out_base,(* 
out_stride)*sizeof(double), - (uint*)code_base,(*code_stride)*sizeof(sint ), - (uint*)proc_base,(*proc_stride)*sizeof(sint ), - (uint*) el_base,(* el_stride)*sizeof(sint ), - r_base,(* r_stride)*sizeof(double), - *npt, in, h->data); -} - -void ffindpts_eval_local(const sint *const handle, - double *const out_base, const sint *const out_stride, - const sint *const el_base, const sint *const el_stride, - const double *const r_base, const sint *const r_stride, - const sint *const npt, const double *const in) -{ - CHECK_HANDLE("findpts_eval_local"); - if(h->ndim==2) - findpts_local_eval_2( - out_base,(* out_stride)*sizeof(double), - (uint*) el_base,(* el_stride)*sizeof(sint ), - r_base,(* r_stride)*sizeof(double), - *npt, in, &((struct findpts_data_2 *)h->data)->local); - else - findpts_local_eval_3( - out_base,(* out_stride)*sizeof(double), - (uint*) el_base,(* el_stride)*sizeof(sint ), - r_base,(* r_stride)*sizeof(double), - *npt, in, &((struct findpts_data_3 *)h->data)->local); -} diff --git a/3rdParty/gslib/src/findpts.h b/3rdParty/gslib/src/findpts.h deleted file mode 100644 index 16846917c..000000000 --- a/3rdParty/gslib/src/findpts.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef FINDPTS_H -#define FINDPTS_H - -#if !defined(COMM_H) -#warning "findpts.h" requires "comm.h" -#endif - -#define findpts_setup_2 PREFIXED_NAME(findpts_setup_2) -#define findpts_free_2 PREFIXED_NAME(findpts_free_2 ) -#define findpts_2 PREFIXED_NAME(findpts_2 ) -#define findpts_eval_2 PREFIXED_NAME(findpts_eval_2 ) -#define findpts_setup_3 PREFIXED_NAME(findpts_setup_3) -#define findpts_free_3 PREFIXED_NAME(findpts_free_3 ) -#define findpts_3 PREFIXED_NAME(findpts_3 ) -#define findpts_eval_3 PREFIXED_NAME(findpts_eval_3 ) - -struct findpts_data_2; -struct findpts_data_3; - -struct findpts_data_2 *findpts_setup_2( - const struct comm *const comm, - const double *const elx[2], - const unsigned n[2], const uint nel, - const unsigned m[2], const double bbox_tol, - const uint local_hash_size, const 
uint global_hash_size, - const unsigned npt_max, const double newt_tol); - -struct findpts_data_3 *findpts_setup_3( - const struct comm *const comm, - const double *const elx[3], - const unsigned n[3], const uint nel, - const unsigned m[3], const double bbox_tol, - const uint local_hash_size, const uint global_hash_size, - const unsigned npt_max, const double newt_tol); - -void findpts_free_2(struct findpts_data_2 *fd); -void findpts_free_3(struct findpts_data_3 *fd); - -void findpts_2( uint *const code_base , const unsigned code_stride , - uint *const proc_base , const unsigned proc_stride , - uint *const el_base , const unsigned el_stride , - double *const r_base , const unsigned r_stride , - double *const dist2_base , const unsigned dist2_stride , - const double *const x_base[2], const unsigned x_stride[2], - const uint npt, struct findpts_data_2 *const fd); - -void findpts_3( uint *const code_base , const unsigned code_stride , - uint *const proc_base , const unsigned proc_stride , - uint *const el_base , const unsigned el_stride , - double *const r_base , const unsigned r_stride , - double *const dist2_base , const unsigned dist2_stride , - const double *const x_base[3], const unsigned x_stride[3], - const uint npt, struct findpts_data_3 *const fd); - -void findpts_eval_2( - double *const out_base, const unsigned out_stride, - const uint *const code_base, const unsigned code_stride, - const uint *const proc_base, const unsigned proc_stride, - const uint *const el_base, const unsigned el_stride, - const double *const r_base, const unsigned r_stride, - const uint npt, - const double *const in, struct findpts_data_2 *const fd); - -void findpts_eval_3( - double *const out_base, const unsigned out_stride, - const uint *const code_base, const unsigned code_stride, - const uint *const proc_base, const unsigned proc_stride, - const uint *const el_base, const unsigned el_stride, - const double *const r_base, const unsigned r_stride, - const uint npt, - const double 
*const in, struct findpts_data_3 *const fd); - -#endif diff --git a/3rdParty/gslib/src/findpts_el.h b/3rdParty/gslib/src/findpts_el.h deleted file mode 100644 index 4ed119aae..000000000 --- a/3rdParty/gslib/src/findpts_el.h +++ /dev/null @@ -1,122 +0,0 @@ -#ifndef FINDPTS_EL_H -#define FINDPTS_EL_H - -#if !defined(NAME_H) || !defined(POLY_H) -#warning "findpts_el.h" requires "name.h", "poly.h" -#endif - -#define findpts_el_setup_2 PREFIXED_NAME(findpts_el_setup_2) -#define findpts_el_free_2 PREFIXED_NAME(findpts_el_free_2 ) -#define findpts_el_2 PREFIXED_NAME(findpts_el_2 ) -#define findpts_el_eval_2 PREFIXED_NAME(findpts_el_eval_2 ) - -struct findpts_el_pt_2 { - double x[2],r[2],oldr[2],dist2,dist2p,tr; - unsigned index,flags; -}; - -struct findpts_el_gedge_2 { const double *x[2], *dxdn[2]; }; -struct findpts_el_gpt_2 { double x[2], jac[4], hes[4]; }; - -struct findpts_el_data_2 { - unsigned npt_max; - struct findpts_el_pt_2 *p; - - unsigned n[2]; - double *z[2]; - lagrange_fun *lag[2]; - double *lag_data[2]; - double *wtend[2]; - - const double *x[2]; - - unsigned side_init; - double *sides; - struct findpts_el_gedge_2 edge[4]; /* R S=-1; R S=1; ... 
*/ - struct findpts_el_gpt_2 pt[4]; - - double *work; -}; - -void findpts_el_setup_2(struct findpts_el_data_2 *const fd, - const unsigned n[2], - const unsigned npt_max); -void findpts_el_free_2(struct findpts_el_data_2 *const fd); -void findpts_el_2(struct findpts_el_data_2 *fd, unsigned npt, const double tol); -void findpts_el_eval_2( - double *const out_base, const unsigned out_stride, - const double *const r_base, const unsigned r_stride, const unsigned pn, - const double *const in, struct findpts_el_data_2 *const fd); - -static void findpts_el_start_2(struct findpts_el_data_2 *const fd, - const double *const x[2]) -{ - fd->side_init=0,fd->x[0]=x[0],fd->x[1]=x[1]; -} - -static struct findpts_el_pt_2 *findpts_el_points_2( - struct findpts_el_data_2 *const fd) -{ - return fd->p; -} - -#define findpts_el_setup_3 PREFIXED_NAME(findpts_el_setup_3) -#define findpts_el_free_3 PREFIXED_NAME(findpts_el_free_3 ) -#define findpts_el_3 PREFIXED_NAME(findpts_el_3 ) -#define findpts_el_eval_3 PREFIXED_NAME(findpts_el_eval_3 ) - -struct findpts_el_pt_3 { - double x[3],r[3],oldr[3],dist2,dist2p,tr; - unsigned index,flags; -}; - -struct findpts_el_gface_3 { const double *x[3], *dxdn[3]; }; -struct findpts_el_gedge_3 { const double *x[3], *dxdn1[3], *dxdn2[3], - *d2xdn1[3], *d2xdn2[3]; }; -struct findpts_el_gpt_3 { double x[3], jac[9], hes[18]; }; - -struct findpts_el_data_3 { - unsigned npt_max; - struct findpts_el_pt_3 *p; - - unsigned n[3]; - double *z[3]; - lagrange_fun *lag[3]; - double *lag_data[3]; - double *wtend[3]; - - const double *x[3]; - - unsigned side_init; - double *sides; - struct findpts_el_gface_3 face[6]; /* ST R=-1,R=+1; TR S=-1,S=+1; ... */ - struct findpts_el_gedge_3 edge[12]; /* R S=-1,T=-1; R S=1,T=-1; ... 
*/ - struct findpts_el_gpt_3 pt[8]; - - double *work; -}; - -void findpts_el_setup_3(struct findpts_el_data_3 *const fd, - const unsigned n[3], - const unsigned npt_max); -void findpts_el_free_3(struct findpts_el_data_3 *const fd); -void findpts_el_3(struct findpts_el_data_3 *const fd, const unsigned npt, - const double tol); -void findpts_el_eval_3( - double *const out_base, const unsigned out_stride, - const double *const r_base, const unsigned r_stride, const unsigned pn, - const double *const in, struct findpts_el_data_3 *const fd); - -static void findpts_el_start_3(struct findpts_el_data_3 *const fd, - const double *const x[3]) -{ - fd->side_init=0,fd->x[0]=x[0],fd->x[1]=x[1],fd->x[2]=x[2]; -} - -static struct findpts_el_pt_3 *findpts_el_points_3( - struct findpts_el_data_3 *const fd) -{ - return fd->p; -} - -#endif diff --git a/3rdParty/gslib/src/findpts_el_2.c b/3rdParty/gslib/src/findpts_el_2.c deleted file mode 100644 index 9d6eca601..000000000 --- a/3rdParty/gslib/src/findpts_el_2.c +++ /dev/null @@ -1,819 +0,0 @@ -#include - -#include -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "mem.h" -#include "tensor.h" -#include "poly.h" - -#define findpts_el_setup_2 PREFIXED_NAME(findpts_el_setup_2) -#define findpts_el_free_2 PREFIXED_NAME(findpts_el_free_2 ) -#define findpts_el_2 PREFIXED_NAME(findpts_el_2 ) -#define findpts_el_eval_2 PREFIXED_NAME(findpts_el_eval_2 ) -/* -#define DIAGNOSTICS_1 -#define DIAGNOSTICS_2 -*/ -#define DIAGNOSTICS_ITERATIONS 0 - -#if defined(DIAGNOSTICS_1) || defined(DIAGNOSTICS_2) \ - || DIAGNOSTICS_ITERATIONS > 0 -#include -#endif - -/* A is row-major */ -static void lin_solve_2(double x[2], const double A[4], const double y[2]) -{ - const double idet = 1/(A[0]*A[3] - A[1]*A[2]); - x[0] = idet*(A[3]*y[0] - A[1]*y[1]); - x[1] = idet*(A[0]*y[1] - A[2]*y[0]); -} - -struct findpts_el_pt_2 { - double x[2],r[2],oldr[2],dist2,dist2p,tr; - unsigned index,flags; 
-}; - -/* the bit structure of flags is CSSRR - the C bit --- 1<<4 --- is set when the point is converged - RR is 0 = 00b if r is unconstrained, - 1 = 01b if r is constrained at -1 - 2 = 10b if r is constrained at +1 - SS is similarly for s constraints -*/ - -#define CONVERGED_FLAG (1u<<4) -#define FLAG_MASK 0x1fu - -static unsigned num_constrained(const unsigned flags) -{ - const unsigned y = flags | flags>>1; - return (y&1u) + (y>>2 & 1u); -} - -static unsigned pt_flags_to_bin_noC(const unsigned flags) -{ - return (flags>>2 & 3u)*3 + (flags & 3u); -} - -/* map flags to 9 if the C bit is set, - else to [0,8] --- the 9 valid configs of SSRR */ -static unsigned pt_flags_to_bin(const unsigned flags) -{ - const unsigned mask = 0u - (flags>>4); /* 0 or 0xfff... when converged */ - return (mask & 9u) | (~mask & pt_flags_to_bin_noC(flags)); -} - -/* assumes x = 0, or 1 */ -static unsigned plus_1_mod_2(const unsigned x) { return x^1u; } - -/* assumes x = 1 << i, with i < 4, returns i+1 */ -static unsigned which_bit(const unsigned x) -{ - const unsigned y = x&7u; - return (y-(y>>2)) | ((x-1)&4u); -} - -static unsigned edge_index(const unsigned x) { return which_bit(x)-1; } - -static unsigned point_index(const unsigned x) -{ - return ((x>>1)&1u) | ((x>>2)&2u); -} - -/* extra data - - we need x, dx/dn for each edge - r: x at 0, nrs - nr, - 4*nr extra for dx/dn - s: 8*ns extra - -*/ - -struct findpts_el_gedge_2 { const double *x[2], *dxdn[2]; }; -struct findpts_el_gpt_2 { double x[2], jac[4], hes[4]; }; - -struct findpts_el_data_2 { - unsigned npt_max; - struct findpts_el_pt_2 *p; - - unsigned n[2]; - double *z[2]; - lagrange_fun *lag[2]; - double *lag_data[2]; - double *wtend[2]; - - const double *x[2]; - - unsigned side_init; - double *sides; - struct findpts_el_gedge_2 edge[4]; /* R=-1 S; R=1 S; ... 
*/ - struct findpts_el_gpt_2 pt[4]; - - double *work; -}; - -/* work[2*(nr+ns)] */ -/* work[4*(nr+ns)] */ -/* work[6*(nr+6)] */ -/* work[(6+2*(2*nr+ns))*pn] */ -/* work[(10+3*n)*pn] */ -static unsigned work_size( - const unsigned nr, const unsigned ns, const unsigned npt_max) -{ - const unsigned n = ns>nr?ns:nr; - unsigned wsize; - #define DO_MAX(x) do { const unsigned temp=(x); \ - wsize=temp>wsize?temp:wsize; } while(0) - wsize = (6 + 2*(2*nr+ns)) * npt_max; - DO_MAX(4*(nr+ns)); - DO_MAX(6*(nr+6)); - DO_MAX(npt_max*(10+3*n)); - #undef DO_MAX - return wsize; -} - -void findpts_el_setup_2(struct findpts_el_data_2 *const fd, - const unsigned n[2], - const unsigned npt_max) -{ - const unsigned nr=n[0], ns=n[1]; - const unsigned tot = 8*ns + 4*nr; - unsigned d,i, lag_size[2]; - - fd->npt_max = npt_max; - fd->p = tmalloc(struct findpts_el_pt_2, npt_max*2); - - fd->n[0]=nr, fd->n[1]=ns; - for(d=0;d<2;++d) lag_size[d] = gll_lag_size(fd->n[d]); - - fd->z[0] = tmalloc(double,lag_size[0]+lag_size[1] - +7*(nr+ns) + tot + - work_size(nr,ns,npt_max)); - fd->z[1] = fd->z[0]+nr; - fd->lag_data[0] = fd->z[1]+ns; - fd->lag_data[1] = fd->lag_data[0]+lag_size[0]; - fd->wtend[0] = fd->lag_data[1]+lag_size[1]; - fd->wtend[1] = fd->wtend[0]+6*nr; - fd->sides = fd->wtend[1]+6*ns; - fd->work = fd->sides + tot; - - fd->side_init = 0; - - for(d=0;d<2;++d) { - double *wt=fd->wtend[d]; unsigned nn=fd->n[d]; - lobatto_nodes(fd->z[d],nn); - fd->lag[d] = gll_lag_setup(fd->lag_data[d],nn); - fd->lag[d](wt , fd->lag_data[d],nn,2,-1); - fd->lag[d](wt+3*nn, fd->lag_data[d],nn,2, 1); - - wt[0]=1; for(i=1;iedge[0].x[d] = fd->sides + d *ns, \ - fd->edge[0].dxdn[d] = fd->sides + (2+d)*ns, \ - fd->edge[1].x[d] = fd->sides + (4+d)*ns, \ - fd->edge[1].dxdn[d] = fd->sides + (6+d)*ns; \ - - for(d=0;d<2;++d) - fd->edge[2].x[d] = 0, /* will point to user data */ - fd->edge[2].dxdn[d] = fd->sides + 8*ns + d *nr, - fd->edge[3].x[d] = 0, /* will point to user data */ - fd->edge[3].dxdn[d] = fd->sides + 8*ns + 
(2+d)*nr; -} - -void findpts_el_free_2(struct findpts_el_data_2 *const fd) -{ - free(fd->p); - free(fd->z[0]); -} - -typedef void compute_edge_data_fun(struct findpts_el_data_2 *fd); - -/* work[2*(nr+ns)] */ -static void compute_edge_data_r(struct findpts_el_data_2 *fd) -{ - const unsigned nr = fd->n[0], ns=fd->n[1], nrsm1 = nr*(ns-1); - unsigned d; - double *work = fd->work, *out = fd->sides + 8*ns; - memcpy(work , fd->wtend[1]+ ns, ns*sizeof(double)); - memcpy(work+ns, fd->wtend[1]+4*ns, ns*sizeof(double)); - for(d=0;d<2;++d) { - tensor_mxm(work+2*ns,nr, fd->x[d],ns, work,2); - memcpy(out+ d *nr, work+2*ns , nr*sizeof(double)); - memcpy(out+(2+d)*nr, work+2*ns+nr , nr*sizeof(double)); - fd->edge[2].x[d] = fd->x[d]; - fd->edge[3].x[d] = fd->x[d] + nrsm1; - } -} - -/* work[4*(nr+ns)] */ -static void compute_edge_data_s(struct findpts_el_data_2 *fd) -{ - const unsigned nr = fd->n[0], ns=fd->n[1]; - unsigned d; - double *work = fd->work, *out = fd->sides; - memcpy(work , fd->wtend[0] , 2*nr*sizeof(double)); - memcpy(work+2*nr, fd->wtend[0]+3*nr, 2*nr*sizeof(double)); - for(d=0;d<2;++d) { - tensor_mtxm(work+4*nr,ns, fd->x[d],nr, work,4); - memcpy(out+ d *ns, work+4*nr , ns*sizeof(double)); - memcpy(out+(2+d)*ns, work+4*nr+ ns, ns*sizeof(double)); - memcpy(out+(4+d)*ns, work+4*nr+2*ns, ns*sizeof(double)); - memcpy(out+(6+d)*ns, work+4*nr+3*ns, ns*sizeof(double)); - } -} - -static const struct findpts_el_gedge_2 *get_edge( - struct findpts_el_data_2 *fd, unsigned ei) -{ - const unsigned mask = 1u<<(ei/2); - if((fd->side_init&mask)==0) { - compute_edge_data_fun *const fun[2] = { - compute_edge_data_s, - compute_edge_data_r - }; - fun[ei/2](fd); - fd->side_init |= mask; - } - return &fd->edge[ei]; -} - -/* work[6*(nr+6)] */ -static void compute_pt_data(struct findpts_el_data_2 *fd) -{ - const unsigned nr = fd->n[0], ns = fd->n[1]; - double *work = fd->work, *work2 = work+6*nr; - unsigned d,i,j; - for(d=0;d<2;++d) { - tensor_mxm(work,nr, fd->x[d],ns, fd->wtend[1],6); - 
tensor_mtxm(work2,6, fd->wtend[0],nr, work,6); - for(j=0;j<2;++j) for(i=0;i<2;++i) { - fd->pt[2*j+i].x[d] = work2[6*(3*j+0)+(3*i+0)]; - fd->pt[2*j+i].jac[2*d+0] = work2[6*(3*j+0)+(3*i+1)]; - fd->pt[2*j+i].jac[2*d+1] = work2[6*(3*j+1)+(3*i+0)]; - fd->pt[2*j+i].hes[2*d+0] = work2[6*(3*j+0)+(3*i+2)]; - fd->pt[2*j+i].hes[2*d+1] = work2[6*(3*j+2)+(3*i+0)]; - } - } -} - -static const struct findpts_el_gpt_2 *get_pt( - struct findpts_el_data_2 *fd, unsigned pi) -{ - if((fd->side_init&4u)==0) - compute_pt_data(fd), fd->side_init |= 4u; - return &fd->pt[pi]; -} - -/* check reduction in objective against prediction, and adjust - trust region radius (p->tr) accordingly; - may reject the prior step, returning 1; otherwise returns 0 - sets out->dist2, out->index, out->x, out->oldr in any event, - leaving out->r, out->dr, out->flags to be set when returning 0 */ -static int reject_prior_step_q(struct findpts_el_pt_2 *const out, - const double resid[2], - const struct findpts_el_pt_2 *const p, - const double tol) -{ - const double old_dist2 = p->dist2; - const double dist2 = resid[0]*resid[0]+resid[1]*resid[1]; - const double decr = old_dist2-dist2; - const double pred = p->dist2p; - out->x[0]=p->x[0],out->x[1]=p->x[1]; - out->oldr[0]=p->r[0],out->oldr[1]=p->r[1]; - out->index=p->index; - out->dist2=dist2; -#ifdef DIAGNOSTICS_2 - printf("Checking prior step:\n" - " old r = (%.17g,%.17g), old flags = %x\n" - " old_dist2 = %.17g\n" - " r = (%.17g,%.17g), flags = %x\n" - " dist2 = %.17g\n" - " difference = %.17g\n" - " predicted = %.17g\n" - " rho = %.17g\n", - p->oldr[0],p->oldr[1],(p->flags>>5)&FLAG_MASK,old_dist2, - p->r[0],p->r[1],p->flags&FLAG_MASK,dist2, - decr, pred, decr/pred); -#endif - if(decr>= 0.01 * pred) { - if(decr>= 0.9 * pred) { - out->tr = p->tr*2; -#ifdef DIAGNOSTICS_2 - printf(" very good iteration; tr -> %g\n", out->tr); -#endif - } else { -#ifdef DIAGNOSTICS_2 - printf(" good iteration; tr = %g\n", p->tr); -#endif - out->tr = p->tr; - } - return 0; - } else { - 
/* reject step; note: the point will pass through this routine - again, and we set things up here so it gets classed as a - "very good iteration" --- this doubles the trust radius, - which is why we divide by 4 below */ - double v0 = fabs(p->r[0]-p->oldr[0]), - v1 = fabs(p->r[1]-p->oldr[1]); - out->tr = (v0>v1?v0:v1)/4; -#ifdef DIAGNOSTICS_2 - printf(" bad iteration; tr -> %g\n", out->tr); -#endif - out->dist2=old_dist2; - out->r[0]=p->oldr[0],out->r[1]=p->oldr[1]; - out->flags=p->flags>>5; - out->dist2p=-DBL_MAX; - if(pred < dist2*tol) out->flags|=CONVERGED_FLAG; - return 1; - } -} - -/* minimize ||resid - jac * dr||_2, with |dr| <= tr, |r0+dr|<=1 - (exact solution of trust region problem) */ -static void newton_area(struct findpts_el_pt_2 *const out, - const double jac[4], const double resid[2], - const struct findpts_el_pt_2 *const p, const double tol) -{ - const double tr = p->tr; - double bnd[4] = { -1,1, -1,1 }; - double r0[2]; - double dr[2], fac; - unsigned d, mask, flags; - - r0[0] = p->r[0], r0[1] = p->r[1]; - -#ifdef DIAGNOSTICS_1 - printf("newton_area:\n"); - printf(" resid = (%g,%g); r^T r / 2 = %g\n",resid[0],resid[1], - (resid[0]*resid[0]+resid[1]*resid[1])/2); - printf(" jac = %g\t%g\n" - " %g\t%g\n", - jac[0],jac[1],jac[2],jac[3]); - printf(" r = (%.17g,%.17g)\n",r0[0],r0[1]); -#endif - - mask = 0xfu; - for(d=0;d<2;++d) { - if(r0[d]-tr>-1) bnd[2*d ]=r0[d]-tr, mask^=1u<<(2*d); - if(r0[d]+tr< 1) bnd[2*d+1]=r0[d]+tr, mask^=2u<<(2*d); - } - - lin_solve_2(dr, jac,resid); - -#ifdef DIAGNOSTICS_1 - printf(" min at r = (%.17g,%.17g)\n", r0[0]+dr[0],r0[1]+dr[1]); -#endif - - fac = 1, flags = 0; - for(d=0;d<2;++d) { - double nr = r0[d]+dr[d]; - if((nr-bnd[2*d])*(bnd[2*d+1]-nr)>=0) continue; - if(nr>1, de = plus_1_mod_2(dn); - const double res0 = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]), - res1 = resid[1]-(jac[2]*dr[0]+jac[3]*dr[1]); - /* y = J_u^T res */ - const double y = jac[de]*res0+jac[2+de]*res1; - /* JtJ = J_u^T J_u */ - const double JtJ = jac[ de]*jac[ 
de] - +jac[2+de]*jac[2+de]; - const double drc = y/JtJ; - double ffac = 1; - unsigned new_flags = 0; -#ifdef DIAGNOSTICS_1 - printf(" edge %u, de=%u\n",ei,de); - printf(" r=(%.17g,%.17g)\n", r0[0]+dr[0],r0[1]+dr[1]); - printf(" resid = (%g,%g); r^T r / 2 = %g\n",res0,res1, - (res[0]*res[0]+res[1]*res[1])/2); - printf(" min at %.17g\n", r0[de]+dr[de]+drc); -#endif - { - const double rz = r0[de]+dr[de], lb=bnd[2*de],ub=bnd[2*de+1]; - const double nr = r0[de]+(dr[de]+drc); - if((nr-lb)*(ub-nr)<0) { - if(nr>(2*d) & 3u; \ - if(f) dr[d] = bnd[2*d+(f-1)] - r0[d]; \ - } while(0) - SETDR(0); SETDR(1); - #undef SETDR - for(d=0;d<2;++d) { - unsigned c = flags>>(2*d) & 3u; - if(c==0) continue; - else if(dr[d]*y[d]<0) flags &= ~(3u<<(2*d)); -#ifdef DIAGNOSTICS_1 - if( (c==1&&dr[d]>0) || (c==2&&dr[d]<0) ) - printf("FAIL! c=%u, dr[d]=%g\n",c,dr[d]); -#endif - } -#ifdef DIAGNOSTICS_1 - printf(" checking constraints (%x)\n",old_flags); - printf(" r=(%.17g,%.17g)\n", r0[0]+dr[0],r0[1]+dr[1]); - printf(" resid = (%g,%g); r^T r / 2 = %g\n",res[0],res[1], - (res[0]*res[0]+res[1]*res[1])/2); - printf(" relaxed %x -> %x\n",old_flags,flags); -#endif - if(flags==old_flags) goto newton_area_fin; - switch(num_constrained(flags)) { - case 1: goto newton_area_edge; - } - } - -newton_area_fin: -#ifdef DIAGNOSTICS_1 - { - const double res[2]={ resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]), - resid[1]-(jac[2]*dr[0]+jac[3]*dr[1]) }; - printf(" r=(%.17g,%.17g)\n", r0[0]+dr[0],r0[1]+dr[1]); - printf(" resid = (%g,%g); r^T r / 2 = %g\n",res[0],res[1], - (res[0]*res[0]+res[1]*res[1])/2); - } -#endif - flags &= mask; - if(fabs(dr[0])+fabs(dr[1]) < tol) flags |= CONVERGED_FLAG; - { - const double res0 = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]), - res1 = resid[1]-(jac[2]*dr[0]+jac[3]*dr[1]); - out->dist2p=resid[0]*resid[0]+resid[1]*resid[1] - -(res0*res0+res1*res1); - } - #define SETR(d) do { \ - unsigned f = flags>>(2*d) & 3u; \ - out->r[d] = f==0 ? r0[d]+dr[d] : ( f==1 ? 
-1 : 1 ); \ - } while(0) - SETR(0); SETR(1); - #undef SETR - out->flags = flags | (p->flags<<5); -} - -static void newton_edge(struct findpts_el_pt_2 *const out, - const double jac[4], const double rhes, const double resid[2], - const unsigned de, const unsigned dn, - unsigned flags, - const struct findpts_el_pt_2 *const p, const double tol) -{ - const double tr = p->tr; - /* A = J^T J - resid_d H_d */ - const double A = jac[ de]*jac[ de] - +jac[2+de]*jac[2+de] - rhes; - /* y = J^T r */ - const double y = jac[ de]*resid[0] - +jac[2+de]*resid[1]; - - const double oldr = p->r[de]; - double dr,nr,tdr,tnr; - double v,tv; unsigned new_flags=0, tnew_flags=0; - -#ifdef DIAGNOSTICS_1 - printf("Newton edge %u (dn=%u) flags=%x\n",de,dn,flags); - printf(" A=%g, y=%g\n",A,y); - if(A<=0) printf(" A not positive\n"); - printf(" r=(%.17g,%.17g)\n",p->r[0],p->r[1]); -#endif - - #define EVAL(dr) (dr*A-2*y)*dr - - /* if A is not SPD, quadratic model has no minimum */ - if(A>0) { - dr = y/A, nr = oldr+dr; - if(fabs(dr)-1) dr=-tr; - else nr=-1, dr=-1-oldr, new_flags = flags | 1u<<(2*de); - v =EVAL( dr); - - if((tnr=oldr+tr)< 1) tdr=tr; - else tnr= 1, tdr= 1-oldr, tnew_flags = flags | 2u<<(2*de); - tv=EVAL(tdr); - - if(tvr[de]=nr; - out->r[dn]=p->r[dn]; - out->dist2p = -v; - out->flags = flags | new_flags | (p->flags<<5); -#ifdef DIAGNOSTICS_1 - printf(" new r = (%.17g,%.17g)\n",out->r[0],out->r[1]); -#endif -} - -typedef void findpt_fun( - struct findpts_el_pt_2 *const out, - struct findpts_el_data_2 *const fd, - const struct findpts_el_pt_2 *const p, const unsigned pn, const double tol); - -/* work[(6+2*(2*nr+ns))*pn] */ -static void findpt_area( - struct findpts_el_pt_2 *const out, - struct findpts_el_data_2 *const fd, - const struct findpts_el_pt_2 *const p, const unsigned pn, const double tol) -{ - const unsigned nr=fd->n[0],ns=fd->n[1]; - double *const resid = fd->work, *const jac = resid + 2*pn, - *const wtr = jac+4*pn, *const wts = wtr+2*nr*pn, - *const slice = wts+2*ns*pn; - 
unsigned i; unsigned d; - /* evaluate x(r) and jacobian */ - for(i=0;ilag[0](wtr+2*i*nr, fd->lag_data[0], nr, 1, p[i].r[0]); - for(i=0;ilag[1](wts+2*i*ns, fd->lag_data[1], ns, 1, p[i].r[1]); - for(d=0;d<2;++d) { - tensor_mxm(slice,nr, fd->x[d],ns, wts,2*pn); - for(i=0;iflags & FLAG_MASK; - const unsigned ei = edge_index(pflag); - const unsigned dn = ei>>1, de = plus_1_mod_2(dn); - const unsigned n = fd->n[de]; - double *const resid=fd->work, *const jac=resid+2*pn, *const hes=jac+4*pn, - *const wt = hes+pn, *const slice = wt+3*n*pn; - const struct findpts_el_gedge_2 *const edge = get_edge(fd,ei); - unsigned i; unsigned d; - -#ifdef DIAGNOSTICS_1 - printf("Edge %u\n",ei); - printf(" pflag = %u\n",pflag); - printf(" ei = %u\n",ei); - printf(" dn, de = %u, %u\n",dn,de); - printf(" n = %u \n", n); -#endif - - /* evaluate x(r), jacobian, hessian */ - for(i=0;ilag[de](wt+3*i*n, fd->lag_data[de], n, 2, p[i].r[de]); - for(i=0;ix[d],n); - for(i=0;idxdn[d],n); - for(i=0;iflags & FLAG_MASK; - const unsigned pi = point_index(pflag); - const struct findpts_el_gpt_2 *gpt = get_pt(fd,pi); - const double *const x = gpt->x, *const jac = gpt->jac, *const hes = gpt->hes; - unsigned i; - -#ifdef DIAGNOSTICS_1 - printf("Point %u\n",pi); - printf(" pflag = %u\n",pflag); - printf(" pi = %u\n",pi); -#endif - - for(i=0;in[0], ns=fd->n[1]; - unsigned i,j, ii=0; - for(p=pt;p!=pe;++p) p->dist2=DBL_MAX; - for(j=0;jz[1][j]; - for(i=0;iz[0][i]; - const double x=fd->x[0][ii], y=fd->x[1][ii]; - ++ii; - for(p=pt;p!=pe;++p) { - const double dx=p->x[0]-x,dy=p->x[1]-y; - const double dist2 = dx*dx+dy*dy; - if(p->dist2<=dist2) continue; - p->dist2=dist2; - p->r[0]=zr, p->r[1]=zs; - } - } - } -} - -void findpts_el_2(struct findpts_el_data_2 *const fd, const unsigned npt, - const double tol) -{ - findpt_fun *const fun[3] = - { &findpt_area, &findpt_edge, &findpt_pt }; - struct findpts_el_pt_2 *const pbuf = fd->p, *const pstart = fd->p + npt; - unsigned nconv = npt; - unsigned step = 0; - unsigned count[9] 
= { 0,0,0, 0,0,0, 0,0,0 } ; - count[0]=npt; - seed(fd,pbuf,npt); - { unsigned i; - for(i=0;i1 - { unsigned i; - printf("findpts_el_2 Newton step (%u), %u unconverged:\n ", step,nconv); - for(i=0;i<9;++i) printf(" %u",count[i]); - printf("\n"); - } -#endif - - for(p=pstart,pout=pbuf; p!=pe; p+=pn,pout+=pn) { - const unsigned pflags = p->flags & FLAG_MASK; - pn = count[pt_flags_to_bin_noC(pflags)]; - fun[num_constrained(pflags)](pout, fd, p,pn, tol); - } - /* group points by contsraints */ - { - unsigned offset[10] = { 0,0,0, 0,0,0, 0,0,0, 0 }; - struct findpts_el_pt_2 *const ppe = pbuf+nconv; - for(pout=pbuf; pout!=ppe; ++pout) - ++offset[pt_flags_to_bin(pout->flags & FLAG_MASK)]; - { - unsigned i; unsigned sum=0; - for(i=0;i<9;++i) { - unsigned ci=offset[i]; count[i]=ci, offset[i]=sum, sum+=ci; - } - nconv = offset[9] = sum; /* last bin is converged; forget it */ - } - for(pout=pbuf; pout!=ppe; ++pout) - pstart[offset[pt_flags_to_bin(pout->flags & FLAG_MASK)]++] = *pout; - } - } - { struct findpts_el_pt_2 *p, *const pe=pstart+npt; - for(p=pstart;p!=pe;++p) - pbuf[p->index]=*p, pbuf[p->index].flags&=FLAG_MASK; - } -#if DIAGNOSTICS_ITERATIONS - printf("findpts_el_2 took %u steps\n ", step); -#endif -} - -void findpts_el_eval_2( - double *const out_base, const unsigned out_stride, - const double *const r_base, const unsigned r_stride, const unsigned pn, - const double *const in, struct findpts_el_data_2 *const fd) -{ - const unsigned nr=fd->n[0],ns=fd->n[1]; - double *const wtr = fd->work, *const wts = wtr+nr*pn, - *const slice = wts+ns*pn; - unsigned i; const double *r; double *out; - for(i=0,r=r_base;ilag[0](wtr+i*nr, fd->lag_data[0], nr, 0, r[0]); - fd->lag[1](wts+i*ns, fd->lag_data[1], ns, 0, r[1]); - r = (const double*)((const char*)r + r_stride); - } - - tensor_mxm(slice,nr, in,ns, wts,pn); - for(i=0,out=out_base;i -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "mem.h" -#include 
"tensor.h" -#include "poly.h" - -#define findpts_el_setup_3 PREFIXED_NAME(findpts_el_setup_3) -#define findpts_el_free_3 PREFIXED_NAME(findpts_el_free_3 ) -#define findpts_el_3 PREFIXED_NAME(findpts_el_3 ) -#define findpts_el_eval_3 PREFIXED_NAME(findpts_el_eval_3 ) -/* -#define DIAGNOSTICS_1 -#define DIAGNOSTICS_2 -*/ -#define DIAGNOSTICS_ITERATIONS 0 - -#if defined(DIAGNOSTICS_1) || defined(DIAGNOSTICS_2) \ - || DIAGNOSTICS_ITERATIONS > 0 -#include -#endif - -/* A is row-major */ -static void lin_solve_3(double x[3], const double A[9], const double y[3]) -{ - const double a = A[4]*A[8]-A[5]*A[7], - b = A[5]*A[6]-A[3]*A[8], - c = A[3]*A[7]-A[4]*A[6], - idet = 1/(A[0]*a+A[1]*b+A[2]*c); - const double - inv0 = a, - inv1 = A[2]*A[7]-A[1]*A[8], - inv2 = A[1]*A[5]-A[2]*A[4], - inv3 = b, - inv4 = A[0]*A[8]-A[2]*A[6], - inv5 = A[2]*A[3]-A[0]*A[5], - inv6 = c, - inv7 = A[1]*A[6]-A[0]*A[7], - inv8 = A[0]*A[4]-A[1]*A[3]; - x[0] = idet*(inv0*y[0] + inv1*y[1] + inv2*y[2]); - x[1] = idet*(inv3*y[0] + inv4*y[1] + inv5*y[2]); - x[2] = idet*(inv6*y[0] + inv7*y[1] + inv8*y[2]); -} - -static void lin_solve_sym_2(double x[2], const double A[3], const double y[2]) -{ - const double idet = 1/(A[0]*A[2] - A[1]*A[1]); - x[0] = idet*(A[2]*y[0] - A[1]*y[1]); - x[1] = idet*(A[0]*y[1] - A[1]*y[0]); -} - - -struct findpts_el_pt_3 { - double x[3],r[3],oldr[3],dist2,dist2p,tr; - unsigned index,flags; -}; - -/* the bit structure of flags is CTTSSRR - the C bit --- 1<<6 --- is set when the point is converged - RR is 0 = 00b if r is unconstrained, - 1 = 01b if r is constrained at -1 - 2 = 10b if r is constrained at +1 - SS, TT are similarly for s and t constraints -*/ - -#define CONVERGED_FLAG (1u<<6) -#define FLAG_MASK 0x7fu - -static unsigned num_constrained(const unsigned flags) -{ - const unsigned y = flags | flags>>1; - return (y&1u) + (y>>2 & 1u) + (y>>4 & 1u); -} - -static unsigned pt_flags_to_bin_noC(const unsigned flags) -{ - return ((flags>>4 & 3u)*3 + (flags>>2 & 3u))*3 + (flags & 3u); 
-} - -/* map flags to 27 if the C bit is set, - else to [0,26] --- the 27 valid configs of TTSSRR */ -static unsigned pt_flags_to_bin(const unsigned flags) -{ - const unsigned mask = 0u - (flags>>6); /* 0 or 0xfff... when converged */ - return (mask & 27u) | (~mask & pt_flags_to_bin_noC(flags)); -} - -/* assumes x = 0, 1, or 2 */ -static unsigned plus_1_mod_3(const unsigned x) { return ((x | x>>1)+1) & 3u; } -static unsigned plus_2_mod_3(const unsigned x) -{ - const unsigned y = (x-1) & 3u; - return y ^ (y>>1); -} - -/* assumes x = 1 << i, with i < 6, returns i+1 */ -static unsigned which_bit(const unsigned x) -{ - const unsigned y = x&7u; - return (y-(y>>2)) | ((x-1)&4u) | (x>>4); -} - -static unsigned face_index(const unsigned x) { return which_bit(x)-1; } - -static unsigned edge_index(const unsigned x) -{ - const unsigned y = ~((x>>1) | x); - const unsigned RTSR = ((x>>1)&1u) | ((x>>2)&2u) | ((x>>3)&4u) | ((x<<2)&8u); - const unsigned re = RTSR>>1; - const unsigned se = 4u | RTSR>>2; - const unsigned te = 8u | (RTSR&3u); - return ( (0u - ( y &1u)) & re ) - | ( (0u - ((y>>2)&1u)) & se ) - | ( (0u - ((y>>4)&1u)) & te ); -} - -static unsigned point_index(const unsigned x) -{ - return ((x>>1)&1u) | ((x>>2)&2u) | ((x>>3)&4u); -} - -/* extra data - - we need x, dx/dn for each face - rs: x at 0, nrst - nrs, - 6*nrs extra for dx/dn - st: 12*nst extra - tr: 12*ntr extra - (transposed order for embedded t-edges) - - for each edge, - have x, dx/dn2 already as part of face data - need dx/dn1 (strided in face data) - need d^2x/dn1^2, d^2x/dn2^2 possibly, if constraints relax - thats 3*4*(nr+ns+nt) extra - -*/ - -struct findpts_el_gface_3 { const double *x[3], *dxdn[3]; }; -struct findpts_el_gedge_3 { const double *x[3], *dxdn1[3], *dxdn2[3], - *d2xdn1[3], *d2xdn2[3]; }; -struct findpts_el_gpt_3 { double x[3], jac[9], hes[18]; }; - -struct findpts_el_data_3 { - unsigned npt_max; - struct findpts_el_pt_3 *p; - - unsigned n[3]; - double *z[3]; - lagrange_fun *lag[3]; - double 
*lag_data[3]; - double *wtend[3]; - - const double *x[3]; - - unsigned side_init; - double *sides; - struct findpts_el_gface_3 face[6]; /* ST R=-1,R=+1; TR S=-1,S=+1; ... */ - struct findpts_el_gedge_3 edge[12]; /* R S=-1,T=-1; R S=1,T=-1; ... */ - struct findpts_el_gpt_3 pt[8]; - - double *work; -}; - -/* work[2*nt+2*nrs] */ -/* work[4*nr+4*nst] */ -/* work[4*ns+4*nr] */ -/* work[4*n1+4*n], work[2*n2+2*n] */ -/* work[4*nr+4], work[2*nt+2] */ -/* work[(3+9+2*(nr+ns+nt+nrs))*pn + max(2*nr,ns) ] */ -/* work[(3+9+3+3*(n1+n2+n1))*pn ] */ -/* work[ 3*n ] */ -static unsigned work_size( - const unsigned nr, const unsigned ns, const unsigned nt, - const unsigned npt_max) -{ - unsigned n1, n2, wsize; - if(nr>ns) { - if(nr>nt) n1=nr, n2 = (ns>nt ? ns : nt); - else n1=nt, n2 = nr; - } else { - if(ns>nt) n1=ns, n2 = (nr>nt ? nr : nt); - else n1=nt, n2 = ns; - } - #define DO_MAX(x) do { const unsigned temp=(x); \ - wsize=temp>wsize?temp:wsize; } while(0) - wsize = (12 + 2*(nr+ns+nt+nr*ns)) * npt_max + (2*nr>ns?2*nr:ns); - DO_MAX(2*(nt+nr*ns)); - DO_MAX(4*(nr+ns*nt)); - DO_MAX(4*(n1+n2)); - DO_MAX(npt_max*(15+3*(2*n1+n2))); - #undef DO_MAX - return wsize; -} - -void findpts_el_setup_3(struct findpts_el_data_3 *const fd, - const unsigned n[3], - const unsigned npt_max) -{ - const unsigned nr=n[0], ns=n[1], nt=n[2]; - const unsigned nrs = nr*ns, nst=ns*nt, ntr=nt*nr; - const unsigned face_size = 12*nst + 12*ntr + 6*nrs; - const unsigned off_es = face_size + 36*nr, off_et = off_es + 36*ns, - tot = off_et + 36*nt; - unsigned d,i, lag_size[3]; - - fd->npt_max = npt_max; - fd->p = tmalloc(struct findpts_el_pt_3, npt_max*2); - - fd->n[0]=nr, fd->n[1]=ns, fd->n[2]=nt; - for(d=0;d<3;++d) lag_size[d] = gll_lag_size(fd->n[d]); - - fd->z[0] = tmalloc(double,lag_size[0]+lag_size[1]+lag_size[2] - +7*(nr+ns+nt) + tot + - work_size(nr,ns,nt,npt_max)); - fd->z[1] = fd->z[0]+nr; - fd->z[2] = fd->z[1]+ns; - fd->lag_data[0] = fd->z[2]+nt; - fd->lag_data[1] = fd->lag_data[0]+lag_size[0]; - 
fd->lag_data[2] = fd->lag_data[1]+lag_size[1]; - fd->wtend[0] = fd->lag_data[2]+lag_size[2]; - fd->wtend[1] = fd->wtend[0]+6*nr; - fd->wtend[2] = fd->wtend[1]+6*ns; - fd->sides = fd->wtend[2]+6*nt; - fd->work = fd->sides + tot; - - fd->side_init = 0; - - for(d=0;d<3;++d) { - double *wt=fd->wtend[d]; unsigned nn=fd->n[d]; - lobatto_nodes(fd->z[d],nn); - fd->lag[d] = gll_lag_setup(fd->lag_data[d],nn); - fd->lag[d](wt , fd->lag_data[d],nn,2,-1); - fd->lag[d](wt+3*nn, fd->lag_data[d],nn,2, 1); - - wt[0]=1; for(i=1;iface[2*i ].x[d] = fd->sides + base + d *n, \ - fd->face[2*i ].dxdn[d] = fd->sides + base + (3+d)*n, \ - fd->face[2*i+1].x[d] = fd->sides + base + (6+d)*n, \ - fd->face[2*i+1].dxdn[d] = fd->sides + base + (9+d)*n; \ - } while(0) - SET_FACE(0,0,nst); - SET_FACE(1,12*nst,ntr); - #undef SET_FACE - - for(d=0;d<3;++d) - fd->face[4].x[d] = 0, /* will point to user data */ - fd->face[4].dxdn[d] = fd->sides + 12*(nst+ntr) + d*nrs, - fd->face[5].x[d] = 0, /* will point to user data */ - fd->face[5].dxdn[d] = fd->sides + 12*(nst+ntr) + (3+d)*nrs; - - #define SET_EDGE1(j,k,d,rd,rn,base) \ - for(i=0;i<2;++i) \ - fd->edge[4*j+2*i+0].dxdn2[d] = fd->face[2*k+i].dxdn[d], \ - fd->edge[4*j+2*i+1].dxdn2[d] = fd->face[2*k+i].dxdn[d]+n##rd##rn-n##rd; - #define SET_EDGE2(j,d,rd,rn,base) \ - for(i=0;i<4;++i) \ - fd->edge[4*j+i].dxdn1 [d] = fd->sides + base + (9*i +d)*n##rd, \ - fd->edge[4*j+i].d2xdn1[d] = fd->sides + base + (9*i+3+d)*n##rd, \ - fd->edge[4*j+i].d2xdn2[d] = fd->sides + base + (9*i+6+d)*n##rd; - #define SET_EDGE(j,rd,rn,base) do { \ - for(d=0;d<3;++d) { SET_EDGE1(j,plus_2_mod_3(j),d,rd,rn,base); \ - SET_EDGE2(j,d,rd,rn,base); } \ - } while(0) - SET_EDGE(0,r,s,face_size); - SET_EDGE(1,s,t,off_es); - SET_EDGE(2,t,r,off_et); - #undef SET_EDGE - #undef SET_EDGE2 - #undef SET_EDGE1 -} - -void findpts_el_free_3(struct findpts_el_data_3 *const fd) -{ - free(fd->p); - free(fd->z[0]); -} - -typedef void compute_face_data_fun(struct findpts_el_data_3 *fd); - -/* 
work[2*nt+2*nrs] */ -static void compute_face_data_rs(struct findpts_el_data_3 *fd) -{ - const unsigned nr = fd->n[0], ns=fd->n[1], nt=fd->n[2], - nrs = nr*ns, nst=ns*nt, ntr = nt*nr, nrstm1 = nrs*(nt-1); - unsigned d; - double *work = fd->work, *out = fd->sides + 12*(nst+ntr); - memcpy(work , fd->wtend[2]+ nt, nt*sizeof(double)); - memcpy(work+nt, fd->wtend[2]+4*nt, nt*sizeof(double)); - for(d=0;d<3;++d) { - tensor_mxm(work+2*nt,nrs, fd->x[d],nt, work,2); - memcpy(out+ d *nrs, work+2*nt , nrs*sizeof(double)); - memcpy(out+(3+d)*nrs, work+2*nt+nrs , nrs*sizeof(double)); - fd->face[4].x[d] = fd->x[d]; - fd->face[5].x[d] = fd->x[d] + nrstm1; - } -} - -/* work[4*nr+4*nst] */ -static void compute_face_data_st(struct findpts_el_data_3 *fd) -{ - const unsigned nr = fd->n[0], ns=fd->n[1], nt=fd->n[2], nst=ns*nt; - unsigned i; - double *work = fd->work, *out = fd->sides; - memcpy(work , fd->wtend[0] , 2*nr*sizeof(double)); - memcpy(work+2*nr, fd->wtend[0]+3*nr, 2*nr*sizeof(double)); - for(i=0;i<3;++i) { - tensor_mtxm(work+4*nr,nst, fd->x[i],nr, work,4); - memcpy(out+ i *nst, work+4*nr , nst*sizeof(double)); - memcpy(out+(3+i)*nst, work+4*nr+ nst, nst*sizeof(double)); - memcpy(out+(6+i)*nst, work+4*nr+2*nst, nst*sizeof(double)); - memcpy(out+(9+i)*nst, work+4*nr+3*nst, nst*sizeof(double)); - } -} - -/* work[4*ns+4*nr] */ -static void compute_face_data_tr(struct findpts_el_data_3 *fd) -{ - const unsigned nr = fd->n[0], ns=fd->n[1], nt=fd->n[2], - nrs = nr*ns, nst=ns*nt, ntr=nt*nr; - unsigned i,k,d; - double *work = fd->work, *out = fd->sides + 12*nst; - memcpy(work , fd->wtend[1] , 2*ns*sizeof(double)); - memcpy(work+2*ns, fd->wtend[1]+3*ns, 2*ns*sizeof(double)); - for(d=0;d<3;++d) { - for(k=0;kx[d]+k*nrs,ns, work,4); - for(outk=out+ d *ntr+k,i=0;iside_init&mask)==0) { - compute_face_data_fun *const fun[3] = { - compute_face_data_st, - compute_face_data_tr, - compute_face_data_rs - }; - fun[fi/2](fd); - fd->side_init |= mask; - } - return &fd->face[fi]; -} - -/* 
work[4*n1+4*n], work[2*n2+2*n] */ -static void compute_edge_data(struct findpts_el_data_3 *fd, unsigned d) -{ - const unsigned dn1 = plus_1_mod_3(d), dn2 = plus_2_mod_3(d); - const unsigned n = fd->n[d], n1 = fd->n[dn1], n2 = fd->n[dn2]; - const unsigned nr=fd->n[0],ns=fd->n[1],nt=fd->n[2], - nrs=nr*ns,nst=ns*nt,ntr=nt*nr; - const unsigned base = 6*nrs + 12*nst + 12*ntr - + (d>0 ? 36*nr : 0) + (d>1 ? 36*ns : 0); - #define DXDN1(i,d) (fd->sides+base+(9*(i) +(d))*n) - #define D2XDN1(i,d) (fd->sides+base+(9*(i)+3+(d))*n) - #define D2XDN2(i,d) (fd->sides+base+(9*(i)+6+(d))*n) - const struct findpts_el_gface_3 *face_d_n1 = get_face(fd,2*dn2), - *face_n2_d = get_face(fd,2*dn1); - struct findpts_el_gedge_3 *e = fd->edge + 4*d; - unsigned i,xd; - double *work = fd->work; - for(xd=0;xd<3;++xd) for(i=0;i<2;++i) - e[2*i ].x[xd] = face_d_n1[i].x[xd], - e[2*i+1].x[xd] = face_d_n1[i].x[xd]+n*(n1-1); - memcpy(work , fd->wtend[dn1]+ n1,2*n1*sizeof(double)); - memcpy(work+2*n1, fd->wtend[dn1]+4*n1,2*n1*sizeof(double)); - for(i=0;i<2;++i) for(xd=0;xd<3;++xd) { - tensor_mxm(work+4*n1,n, face_d_n1[i].x[xd],n1, work,4); - memcpy( DXDN1(2*i+0,xd), work+4*n1 , n*sizeof(double)); - memcpy(D2XDN1(2*i+0,xd), work+4*n1+ n, n*sizeof(double)); - memcpy( DXDN1(2*i+1,xd), work+4*n1+2*n, n*sizeof(double)); - memcpy(D2XDN1(2*i+1,xd), work+4*n1+3*n, n*sizeof(double)); - } - memcpy(work , fd->wtend[dn2]+2*n2,n2*sizeof(double)); - memcpy(work+n2, fd->wtend[dn2]+5*n2,n2*sizeof(double)); - for(i=0;i<2;++i) for(xd=0;xd<3;++xd) { - tensor_mtxm(work+2*n2,n, face_n2_d[i].x[xd],n2, work,2); - memcpy(D2XDN2( i,xd), work+2*n2 , n*sizeof(double)); - memcpy(D2XDN2(2+i,xd), work+2*n2+n, n*sizeof(double)); - } - #undef D2XDN2 - #undef D2XDN1 - #undef DXDN1 -} - -static const struct findpts_el_gedge_3 *get_edge( - struct findpts_el_data_3 *fd, unsigned ei) -{ - const unsigned mask = 8u<<(ei/4); - if((fd->side_init&mask)==0) - compute_edge_data(fd,ei/4), fd->side_init |= mask; - return &fd->edge[ei]; -} - -/* 
work[4*nr+4], work[2*nt+2] */ -static void compute_pt_data(struct findpts_el_data_3 *fd) -{ - const unsigned nr = fd->n[0], nt = fd->n[2]; - const struct findpts_el_gedge_3 *e = get_edge(fd,0); - unsigned d,i; - double *work = fd->work; - for(i=0;i<4;++i) for(d=0;d<3;++d) - fd->pt[2*i ].x[d] = e[i].x[d][0], - fd->pt[2*i ].jac[3*d+1] = e[i].dxdn1[d][0], - fd->pt[2*i ].jac[3*d+2] = e[i].dxdn2[d][0], - fd->pt[2*i ].hes[6*d+3] = e[i].d2xdn1[d][0], - fd->pt[2*i ].hes[6*d+5] = e[i].d2xdn2[d][0], - fd->pt[2*i+1].x[d] = e[i].x[d][nr-1], - fd->pt[2*i+1].jac[3*d+1] = e[i].dxdn1[d][nr-1], - fd->pt[2*i+1].jac[3*d+2] = e[i].dxdn2[d][nr-1], - fd->pt[2*i+1].hes[6*d+3] = e[i].d2xdn1[d][nr-1], - fd->pt[2*i+1].hes[6*d+5] = e[i].d2xdn2[d][nr-1]; - memcpy(work , fd->wtend[0]+ nr, 2*nr*sizeof(double)); - memcpy(work+2*nr, fd->wtend[0]+4*nr, 2*nr*sizeof(double)); - for(i=0;i<4;++i) for(d=0;d<3;++d) { - tensor_mtxv(work+4*nr,4, work, e[i].x[d],nr); - fd->pt[2*i ].jac[3*d ] = work[4*nr ]; - fd->pt[2*i ].hes[6*d ] = work[4*nr+1]; - fd->pt[2*i+1].jac[3*d ] = work[4*nr+2]; - fd->pt[2*i+1].hes[6*d ] = work[4*nr+3]; - } - memcpy(work+nr,work+2*nr,nr*sizeof(double)); - for(i=0;i<4;++i) for(d=0;d<3;++d) { - tensor_mtxv(work+2*nr,2, work, e[i].dxdn1[d],nr); - fd->pt[2*i ].hes[6*d+1] = work[2*nr ]; - fd->pt[2*i+1].hes[6*d+1] = work[2*nr+1]; - tensor_mtxv(work+2*nr,2, work, e[i].dxdn2[d],nr); - fd->pt[2*i ].hes[6*d+2] = work[2*nr ]; - fd->pt[2*i+1].hes[6*d+2] = work[2*nr+1]; - } - e = get_edge(fd,8); - memcpy(work , fd->wtend[2]+ nt, nt*sizeof(double)); - memcpy(work+nt, fd->wtend[2]+4*nt, nt*sizeof(double)); - for(i=0;i<4;++i) for(d=0;d<3;++d) { - tensor_mtxv(work+2*nt,2, work, e[i].dxdn2[d],nt); - fd->pt[ i].hes[6*d+4] = work[2*nt ]; - fd->pt[4+i].hes[6*d+4] = work[2*nt+1]; - } -} - -static const struct findpts_el_gpt_3 *get_pt( - struct findpts_el_data_3 *fd, unsigned pi) -{ - if((fd->side_init&0x40u)==0) - compute_pt_data(fd), fd->side_init |= 0x40u; - return &fd->pt[pi]; -} - -/* check 
reduction in objective against prediction, and adjust - trust region radius (p->tr) accordingly; - may reject the prior step, returning 1; otherwise returns 0 - sets out->dist2, out->index, out->x, out->oldr in any event, - leaving out->r, out->dr, out->flags to be set when returning 0 */ -static int reject_prior_step_q(struct findpts_el_pt_3 *const out, - const double resid[3], - const struct findpts_el_pt_3 *const p, - const double tol) -{ - const double old_dist2 = p->dist2; - const double dist2 = resid[0]*resid[0]+resid[1]*resid[1]+resid[2]*resid[2]; - const double decr = old_dist2-dist2; - const double pred = p->dist2p; - out->x[0]=p->x[0],out->x[1]=p->x[1],out->x[2]=p->x[2]; - out->oldr[0]=p->r[0],out->oldr[1]=p->r[1],out->oldr[2]=p->r[2]; - out->index=p->index; - out->dist2=dist2; -#ifdef DIAGNOSTICS_2 - printf("Checking prior step:\n" - " old r = (%.17g,%.17g,%.17g), old flags = %x\n" - " old_dist2 = %.17g\n" - " r = (%.17g,%.17g,%.17g), flags = %x\n" - " dist2 = %.17g\n" - " difference = %.17g\n" - " predicted = %.17g\n" - " rho = %.17g\n", - p->oldr[0],p->oldr[1],p->oldr[2],(p->flags>>7)&FLAG_MASK,old_dist2, - p->r[0],p->r[1],p->r[2],p->flags&FLAG_MASK,dist2, - decr, pred, decr/pred); -#endif - if(decr>= 0.01 * pred) { - if(decr>= 0.9 * pred) { - out->tr = p->tr*2; -#ifdef DIAGNOSTICS_2 - printf(" very good iteration; tr -> %g\n", out->tr); -#endif - } else { -#ifdef DIAGNOSTICS_2 - printf(" good iteration; tr = %g\n", p->tr); -#endif - out->tr = p->tr; - } - return 0; - } else { - /* reject step; note: the point will pass through this routine - again, and we set things up here so it gets classed as a - "very good iteration" --- this doubles the trust radius, - which is why we divide by 4 below */ - double v0 = fabs(p->r[0]-p->oldr[0]), - v1 = fabs(p->r[1]-p->oldr[1]), - v2 = fabs(p->r[2]-p->oldr[2]); - out->tr = (v1>v2?(v0>v1?v0:v1):(v0>v2?v0:v2))/4; -#ifdef DIAGNOSTICS_2 - printf(" bad iteration; tr -> %g\n", out->tr); -#endif - out->dist2=old_dist2; - 
out->r[0]=p->oldr[0],out->r[1]=p->oldr[1],out->r[2]=p->oldr[2]; - out->flags=p->flags>>7; - out->dist2p=-DBL_MAX; - if(pred < dist2*tol) out->flags|=CONVERGED_FLAG; - return 1; - } -} - -/* minimize ||resid - jac * dr||_2, with |dr| <= tr, |r0+dr|<=1 - (exact solution of trust region problem) */ -static void newton_vol(struct findpts_el_pt_3 *const out, - const double jac[9], const double resid[3], - const struct findpts_el_pt_3 *const p, const double tol) -{ - const double tr = p->tr; - double bnd[6] = { -1,1, -1,1, -1,1 }; - double r0[3]; - double dr[3], fac; - unsigned d, mask, flags; - r0[0]=p->r[0],r0[1]=p->r[1],r0[2]=p->r[2]; -#ifdef DIAGNOSTICS_1 - printf("newton_vol:\n"); - printf(" resid = (%g,%g,%g); r^T r / 2 = %g\n",resid[0],resid[1],resid[2], - (resid[0]*resid[0]+resid[1]*resid[1]+resid[2]*resid[2])/2); - printf(" jac = %g\t%g\t%g\n" - " %g\t%g\t%g\n" - " %g\t%g\t%g\n", - jac[0],jac[1],jac[2],jac[3],jac[4],jac[5],jac[6],jac[7],jac[8]); - printf(" r = (%.15g,%.15g,%.15g)\n",r0[0],r0[1],r0[2]); -#endif - - mask = 0x3fu; - for(d=0;d<3;++d) { - if(r0[d]-tr>-1) bnd[2*d ]=r0[d]-tr, mask^=1u<<(2*d); - if(r0[d]+tr< 1) bnd[2*d+1]=r0[d]+tr, mask^=2u<<(2*d); - } - - lin_solve_3(dr, jac,resid); - -#ifdef DIAGNOSTICS_1 - printf(" min at r = (%.17g,%.17g,%.17g)\n", - r0[0]+dr[0],r0[1]+dr[1],r0[2]+dr[2]); -#endif - - fac = 1, flags = 0; - for(d=0;d<3;++d) { - double nr = r0[d]+dr[d]; - if((nr-bnd[2*d])*(bnd[2*d+1]-nr)>=0) continue; - if(nr>1, d1 = plus_1_mod_3(dn), d2 = plus_2_mod_3(dn); - double drc[2], ffac=1; - unsigned new_flags=0; - double res[3], y[2], JtJ[3]; - res[0] = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]+jac[2]*dr[2]), - res[1] = resid[1]-(jac[3]*dr[0]+jac[4]*dr[1]+jac[5]*dr[2]), - res[2] = resid[2]-(jac[6]*dr[0]+jac[7]*dr[1]+jac[8]*dr[2]); - /* y = J_u^T res */ - y[0] = jac[d1]*res[0]+jac[3+d1]*res[1]+jac[6+d1]*res[2], - y[1] = jac[d2]*res[0]+jac[3+d2]*res[1]+jac[6+d2]*res[2]; - /* JtJ = J_u^T J_u */ - JtJ[0] = jac[ d1]*jac[ d1] - +jac[3+d1]*jac[3+d1] - 
+jac[6+d1]*jac[6+d1], - JtJ[1] = jac[ d1]*jac[ d2] - +jac[3+d1]*jac[3+d2] - +jac[6+d1]*jac[6+d2], - JtJ[2] = jac[ d2]*jac[ d2] - +jac[3+d2]*jac[3+d2] - +jac[6+d2]*jac[6+d2]; - lin_solve_sym_2(drc, JtJ,y); -#ifdef DIAGNOSTICS_1 - printf(" face %u, dn=%u, (d1,d2)=(%u,%u)\n",fi,dn,d1,d2); - printf(" r=(%.17g,%.17g,%.17g)\n", r0[0]+dr[0],r0[1]+dr[1],r0[2]+dr[2]); - printf(" resid = (%g,%g,%g); r^T r / 2 = %g\n",res[0],res[1],res[2], - (res[0]*res[0]+res[1]*res[1]+res[2]*res[2])/2); - printf(" min at (%.17g,%.17g)\n", - r0[d1]+dr[d1]+drc[0],r0[d2]+dr[d2]+drc[1]); -#endif - #define CHECK_CONSTRAINT(drcd,d3) do { \ - const double rz = r0[d3]+dr[d3], lb=bnd[2*d3],ub=bnd[2*d3+1]; \ - const double delta=drcd, nr = r0[d3]+(dr[d3]+delta); \ - if((nr-lb)*(ub-nr)<0) { \ - if(nr>2; - double ffac = 1; - unsigned new_flags = 0; - double res[3],y,JtJ,drc; - res[0] = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]+jac[2]*dr[2]), - res[1] = resid[1]-(jac[3]*dr[0]+jac[4]*dr[1]+jac[5]*dr[2]), - res[2] = resid[2]-(jac[6]*dr[0]+jac[7]*dr[1]+jac[8]*dr[2]); - /* y = J_u^T res */ - y = jac[de]*res[0]+jac[3+de]*res[1]+jac[6+de]*res[2]; - /* JtJ = J_u^T J_u */ - JtJ = jac[ de]*jac[ de] - +jac[3+de]*jac[3+de] - +jac[6+de]*jac[6+de]; - drc = y/JtJ; -#ifdef DIAGNOSTICS_1 - printf(" edge %u, de=%u\n",ei,de); - printf(" r=(%.17g,%.17g,%.17g)\n", r0[0]+dr[0],r0[1]+dr[1],r0[2]+dr[2]); - printf(" resid = (%g,%g,%g); r^T r / 2 = %g\n",res[0],res[1],res[2], - (res[0]*res[0]+res[1]*res[1]+res[2]*res[2])/2); - printf(" min at %.17g\n", r0[de]+dr[de]+drc); -#endif - CHECK_CONSTRAINT(drc,de); - #undef CHECK_CONSTRAINT -#ifdef DIAGNOSTICS_1 - printf(" new_flags = %x, ffac = %.17g\n",new_flags,ffac); -#endif - dr[de] += ffac*drc; - flags |= new_flags; - goto newton_vol_relax; - } - - /* check and possibly relax constraints */ - newton_vol_relax: { - const unsigned old_flags = flags; - double res[3], y[3]; - /* res := res_0 - J dr */ - res[0] = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]+jac[2]*dr[2]), - res[1] = 
resid[1]-(jac[3]*dr[0]+jac[4]*dr[1]+jac[5]*dr[2]), - res[2] = resid[2]-(jac[6]*dr[0]+jac[7]*dr[1]+jac[8]*dr[2]); - /* y := J^T res */ - y[0] = jac[0]*res[0]+jac[3]*res[1]+jac[6]*res[2], - y[1] = jac[1]*res[0]+jac[4]*res[1]+jac[7]*res[2], - y[2] = jac[2]*res[0]+jac[5]*res[1]+jac[8]*res[2]; - #define SETDR(d) do { \ - unsigned f = flags>>(2*d) & 3u; \ - if(f) dr[d] = bnd[2*d+(f-1)] - r0[d]; \ - } while(0) - SETDR(0); SETDR(1); SETDR(2); - #undef SETDR - for(d=0;d<3;++d) { - unsigned c = flags>>(2*d) & 3u; - if(c==0) continue; - else if(dr[d]*y[d]<0) flags &= ~(3u<<(2*d)); -#ifdef DIAGNOSTICS_1 - if( (c==1&&dr[d]>0) || (c==2&&dr[d]<0) ) - printf("FAIL! c=%u, dr[d]=%g\n",c,dr[d]); -#endif - } -#ifdef DIAGNOSTICS_1 - printf(" checking constraints (%x)\n",old_flags); - printf(" r=(%.17g,%.17g,%.17g)\n", r0[0]+dr[0],r0[1]+dr[1],r0[2]+dr[2]); - printf(" resid = (%g,%g,%g); r^T r / 2 = %g\n",res[0],res[1],res[2], - (res[0]*res[0]+res[1]*res[1]+res[2]*res[2])/2); - printf(" relaxed %x -> %x\n",old_flags,flags); -#endif - if(flags==old_flags) goto newton_vol_fin; - switch(num_constrained(flags)) { - case 1: goto newton_vol_face; - case 2: goto newton_vol_edge; - } - } - -newton_vol_fin: -#ifdef DIAGNOSTICS_1 - { - const double res[3]={ resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]+jac[2]*dr[2]), - resid[1]-(jac[3]*dr[0]+jac[4]*dr[1]+jac[5]*dr[2]), - resid[2]-(jac[6]*dr[0]+jac[7]*dr[1]+jac[8]*dr[2]) }; - printf(" r=(%.17g,%.17g,%.17g)\n", r0[0]+dr[0],r0[1]+dr[1],r0[2]+dr[2]); - printf(" resid = (%g,%g,%g); r^T r / 2 = %g\n",res[0],res[1],res[2], - (res[0]*res[0]+res[1]*res[1]+res[2]*res[2])/2); - } -#endif - flags &= mask; - if(fabs(dr[0])+fabs(dr[1])+fabs(dr[2]) < tol) flags |= CONVERGED_FLAG; - { - const double res0 = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]+jac[2]*dr[2]), - res1 = resid[1]-(jac[3]*dr[0]+jac[4]*dr[1]+jac[5]*dr[2]), - res2 = resid[2]-(jac[6]*dr[0]+jac[7]*dr[1]+jac[8]*dr[2]); - out->dist2p=resid[0]*resid[0]+resid[1]*resid[1]+resid[2]*resid[2] - 
-(res0*res0+res1*res1+res2*res2); - } - #define SETR(d) do { \ - unsigned f = flags>>(2*d) & 3u; \ - out->r[d] = f==0 ? r0[d]+dr[d] : ( f==1 ? -1 : 1 ); \ - } while(0) - SETR(0); SETR(1); SETR(2); - #undef SETR - out->flags = flags | (p->flags<<7); -} - -static void newton_face(struct findpts_el_pt_3 *const out, - const double jac[9], const double rhes[3], - const double resid[3], - const unsigned d1, const unsigned d2, const unsigned dn, - const unsigned flags, - const struct findpts_el_pt_3 *const p, const double tol) -{ - const double tr = p->tr; - double bnd[4]; - double r[2], dr[2]={0,0}; - unsigned mask, new_flags; - double v, tv; unsigned i; - double A[3], y[2], r0[2]; - /* A = J^T J - resid_d H_d */ - A[0] = jac[ d1]*jac[ d1] - +jac[3+d1]*jac[3+d1] - +jac[6+d1]*jac[6+d1] - rhes[0], - A[1] = jac[ d1]*jac[ d2] - +jac[3+d1]*jac[3+d2] - +jac[6+d1]*jac[6+d2] - rhes[1], - A[2] = jac[ d2]*jac[ d2] - +jac[3+d2]*jac[3+d2] - +jac[6+d2]*jac[6+d2] - rhes[2]; - /* y = J^T r */ - y[0] = jac[ d1]*resid[0] - +jac[3+d1]*resid[1] - +jac[6+d1]*resid[2], - y[1] = jac[ d2]*resid[0] - +jac[3+d2]*resid[1] - +jac[6+d2]*resid[2]; - r0[0] = p->r[d1], r0[1] = p->r[d2]; - -#ifdef DIAGNOSTICS_1 - printf("newton_face, dn=%u, (d1,d2)=%u,%u:\n", dn,d1,d2); - printf(" J^T r = (%g,%g)\n", y[0],y[1]); - printf(" A = %g\t%g\n" - " %g\t%g\n", A[0],A[1],A[1],A[2]); - printf(" r = (%.15g,%.15g)\n", r0[0],r0[1]); -#endif - - new_flags=flags; - mask=0x3fu; - if(r0[0]-tr>-1) bnd[0]=-tr, mask^=1u; else bnd[0]=-1-r0[0]; - if(r0[0]+tr< 1) bnd[1]= tr, mask^=2u; else bnd[1]= 1-r0[0]; - if(r0[1]-tr>-1) bnd[2]=-tr, mask^=1u<<2; else bnd[2]=-1-r0[1]; - if(r0[1]+tr< 1) bnd[3]= tr, mask^=2u<<2; else bnd[3]= 1-r0[1]; - -#ifdef DIAGNOSTICS_1 - printf(" bounds = ([%.15g,%.15g],[%.15g,%.15g])\n", - r0[0]+bnd[0],r0[0]+bnd[1],r0[1]+bnd[2],r0[1]+bnd[3]); -#endif - - if(A[0]+A[2]<=0 || A[0]*A[2]<=A[1]*A[1]) goto newton_face_constrained; - lin_solve_sym_2(dr, A,y); - -#ifdef DIAGNOSTICS_1 - printf(" min at r = 
(%.15g,%.15g)\n", r0[0]+dr[0],r0[1]+dr[1]); -#endif - - #define EVAL(r,s) -(y[0]*r+y[1]*s)+(r*A[0]*r+(2*r*A[1]+s*A[2])*s)/2 - if( (dr[0]-bnd[0])*(bnd[1]-dr[0])>=0 - && (dr[1]-bnd[2])*(bnd[3]-dr[1])>=0) { - r[0] = r0[0]+dr[0], r[1] = r0[1]+dr[1]; - v = EVAL(dr[0],dr[1]); - goto newton_face_fin; - } -newton_face_constrained: - v = EVAL(bnd[0],bnd[2]); i=1u|(1u<<2); - tv = EVAL(bnd[1],bnd[2]); if(tv0) { - double drc; - drc = (y[0] - A[1]*bnd[2])/A[0]; - if((drc-bnd[0])*(bnd[1]-drc)>=0 && (tv=EVAL(drc,bnd[2]))=0 && (tv=EVAL(drc,bnd[3]))0) { - double drc; - drc = (y[1] - A[1]*bnd[0])/A[2]; - if((drc-bnd[2])*(bnd[3]-drc)>=0 && (tv=EVAL(bnd[0],drc))=0 && (tv=EVAL(bnd[1],drc))>(2*d) & 3u; \ - if(f==0) r[d]=r0[d]+dr[d]; \ - else { \ - if((f&(mask>>(2*d)))==0) r[d]=r0[d]+(f==1?-tr:tr); \ - else r[d]=(f==1?-1:1), new_flags |= f<<(2*d3); \ - } \ - } while(0) - SETR(0,d1); SETR(1,d2); -#ifdef DIAGNOSTICS_1 - printf(" constrained min at r = (%.15g,%.15g)\n", r[0],r[1]); -#endif -newton_face_fin: - out->dist2p = -2*v; - dr[0]=r[0]-p->r[d1]; - dr[1]=r[1]-p->r[d2]; - if(fabs(dr[0])+fabs(dr[1]) < tol) new_flags |= CONVERGED_FLAG; - out->r[dn]=p->r[dn], out->r[d1]=r[0],out->r[d2]=r[1]; - out->flags = new_flags | (p->flags<<7); -} - -static void newton_edge(struct findpts_el_pt_3 *const out, - const double jac[9], const double rhes, const double resid[3], - const unsigned de, const unsigned dn1, const unsigned dn2, - unsigned flags, - const struct findpts_el_pt_3 *const p, const double tol) -{ - const double tr = p->tr; - /* A = J^T J - resid_d H_d */ - const double A = jac[ de]*jac[ de] - +jac[3+de]*jac[3+de] - +jac[6+de]*jac[6+de] - rhes; - /* y = J^T r */ - const double y = jac[ de]*resid[0] - +jac[3+de]*resid[1] - +jac[6+de]*resid[2]; - - const double oldr = p->r[de]; - double dr,nr,tdr,tnr; - double v,tv; unsigned new_flags=0, tnew_flags=0; - -#ifdef DIAGNOSTICS_1 - printf("Newton edge %u (dn1=%u,dn2=%u) flags=%x\n",de,dn1,dn2,flags); - printf(" A=%g, y=%g\n",A,y); - if(A<=0) 
printf(" A not positive\n"); - printf(" r=(%g,%g,%g)\n",p->r[0],p->r[1],p->r[2]); -#endif - - #define EVAL(dr) (dr*A-2*y)*dr - - /* if A is not SPD, quadratic model has no minimum */ - if(A>0) { - dr = y/A, nr = oldr+dr; - if(fabs(dr)-1) dr=-tr; - else nr=-1, dr=-1-oldr, new_flags = flags | 1u<<(2*de); - v =EVAL( dr); - - if((tnr=oldr+tr)< 1) tdr=tr; - else tnr= 1, tdr= 1-oldr, tnew_flags = flags | 2u<<(2*de); - tv=EVAL(tdr); - - if(tvr[de]=nr; - out->r[dn1]=p->r[dn1]; - out->r[dn2]=p->r[dn2]; - out->dist2p = -v; - out->flags = flags | new_flags | (p->flags<<7); -#ifdef DIAGNOSTICS_1 - printf(" new r = (%g,%g,%g)\n",out->r[0],out->r[1],out->r[2]); -#endif -} - -typedef void findpt_fun( - struct findpts_el_pt_3 *const out, - struct findpts_el_data_3 *const fd, - const struct findpts_el_pt_3 *const p, const unsigned pn, const double tol); - -/* work[(3+9+2*(nr+ns+nt+nrs))*pn + max(2*nr,ns) ] */ -static void findpt_vol( - struct findpts_el_pt_3 *const out, - struct findpts_el_data_3 *const fd, - const struct findpts_el_pt_3 *const p, const unsigned pn, const double tol) -{ - const unsigned nr=fd->n[0],ns=fd->n[1],nt=fd->n[2], - nrs=nr*ns; - double *const resid = fd->work, *const jac = resid + 3*pn, - *const wtrs = jac+9*pn, *const wtt = wtrs+2*(nr+ns)*pn, - *const slice = wtt+2*nt*pn, *const temp = slice + 2*pn*nrs; - unsigned i; unsigned d; - /* evaluate x(r) and jacobian */ - for(i=0;ilag[0](wtrs+2*i*(nr+ns) , fd->lag_data[0], nr, 1, p[i].r[0]); - for(i=0;ilag[1](wtrs+2*i*(nr+ns)+2*nr, fd->lag_data[1], ns, 1, p[i].r[1]); - for(i=0;ilag[2](wtt+2*i*nt , fd->lag_data[2], nt, 1, p[i].r[2]); - for(d=0;d<3;++d) { - tensor_mxm(slice,nrs, fd->x[d],nt, wtt,2*pn); - for(i=0;iflags & FLAG_MASK; - const unsigned fi = face_index(pflag); - const unsigned dn = fi>>1, d1 = plus_1_mod_3(dn), d2 = plus_2_mod_3(dn); - const unsigned n1 = fd->n[d1], n2 = fd->n[d2]; - double *const resid=fd->work, *const jac=resid+3*pn, *const hes=jac+9*pn, - *const wt1 = hes+3*pn, *const wt2 = 
wt1+3*n1*pn, - *const slice = wt2+3*n2*pn; - const struct findpts_el_gface_3 *const face = get_face(fd,fi); - unsigned i; unsigned d; - -#ifdef DIAGNOSTICS_1 - printf("Face %u\n",fi); - printf(" pflag = %u\n",pflag); - printf(" fi = %u\n",fi); - printf(" dn, d1, d2 = %u, %u, %u\n",dn,d1,d2); - printf(" n1, n2 = %u, %u \n", n1,n2); -#endif - - /* evaluate x(r), jacobian, hessian */ - for(i=0;ilag[d1](wt1+3*i*n1, fd->lag_data[d1], n1, 2, p[i].r[d1]); - for(i=0;ilag[d2](wt2+3*i*n2, fd->lag_data[d2], n2, 2, p[i].r[d2]); - for(i=0;i<3*pn;++i) hes[i]=0; - for(d=0;d<3;++d) { - tensor_mxm(slice,n1, face->x[d],n2, wt2,3*pn); - for(i=0;idxdn[d],n2, wt2,pn); - for(i=0;iflags & FLAG_MASK; - const unsigned ei = edge_index(pflag); - const unsigned de = ei>>2, dn1 = plus_1_mod_3(de), dn2 = plus_2_mod_3(de); - const unsigned n = fd->n[de]; - double *wt = fd->work; - const struct findpts_el_gedge_3 *edge = get_edge(fd,ei); - unsigned i; unsigned d; - -#ifdef DIAGNOSTICS_1 - printf("Edge %u\n",ei); - printf(" pflag = %u\n",pflag); - printf(" ei = %u\n",ei); - printf(" de, dn1, dn2 = %u, %u, %u\n",de,dn1,dn2); - printf(" n = %u \n", n); -#endif - - for(i=0;ilag[de](wt, fd->lag_data[de], n, 2, p[i].r[de]); - for(d=0;d<3;++d) { - double r; - tensor_mtxv(dxi,3, wt, edge->x[d],n); - resid[d] = r = p[i].x[d] - dxi[0]; - jac[3*d+de] = dxi[1]; - hes[0] += r * dxi[2]; - tensor_mtxv(dxi,2, wt, edge->dxdn1[d],n); - jac[3*d+dn1] = dxi[0]; - hes[1] += r * dxi[1]; - tensor_mtxv(dxi,2, wt, edge->dxdn2[d],n); - jac[3*d+dn2] = dxi[0]; - hes[2] += r * dxi[1]; - hes[3] += r * tensor_dot(wt, edge->d2xdn1[d], n); - hes[4] += r * tensor_dot(wt, edge->d2xdn2[d], n); - } - /* check prior step */ - if(reject_prior_step_q(out+i,resid,p+i,tol)) continue; - /* check constraint */ - { - double steep[3], sr1, sr2; - steep[0] = jac[0]*resid[0] + jac[3]*resid[1] + jac[6]*resid[2], - steep[1] = jac[1]*resid[0] + jac[4]*resid[1] + jac[7]*resid[2], - steep[2] = jac[2]*resid[0] + jac[5]*resid[1] + jac[8]*resid[2]; - 
sr1 = steep[dn1]*p[i].r[dn1], - sr2 = steep[dn2]*p[i].r[dn2]; -#ifdef DIAGNOSTICS_1 - printf("jacobian = %g\t%g\t%g\n" - " %g\t%g\t%g\n" - " %g\t%g\t%g\n",jac[0],jac[1],jac[2], - jac[3],jac[4],jac[5],jac[6],jac[7],jac[8]); - printf("hessian = %g\t%g\t%g\n" - " \t%g \n" - " \t \t%g\n", hes[0],hes[1],hes[2],hes[3],hes[4]); - printf("resid = (%g,%g,%g)\n", resid[0],resid[1],resid[2]); - printf("steep1 = %g (%s)\n", steep[dn1], sr1 < 0 ? "in" : "out"); - printf("steep2 = %g (%s)\n", steep[dn2], sr2 < 0 ? "in" : "out"); -#endif - if(sr1<0) { - if(sr2<0) - newton_vol(out+i, jac,resid, p+i, tol); - else { - double rh[3]; rh[0]=hes[0], rh[1]=hes[1], rh[2]=hes[3]; - newton_face(out+i, jac,rh,resid, de,dn1,dn2, - pflag & (3u<<(dn2*2)), p+i, tol); - } - } else if(sr2<0) { - double rh[3]; rh[0]=hes[4], rh[1]=hes[2], rh[2]=hes[0]; - newton_face(out+i, jac,rh,resid, dn2,de,dn1, - pflag & (3u<<(dn1*2)), p+i, tol); - } else - newton_edge(out+i, jac,hes[0],resid, de,dn1,dn2, pflag, p+i, tol); - } - } -} - -static void findpt_pt( - struct findpts_el_pt_3 *const out, - struct findpts_el_data_3 *const fd, - const struct findpts_el_pt_3 *const p, const unsigned pn, const double tol) -{ - const unsigned pflag = p->flags & FLAG_MASK; - const unsigned pi = point_index(pflag); - const struct findpts_el_gpt_3 *gpt = get_pt(fd,pi); - const double *const x = gpt->x, *const jac = gpt->jac, *const hes = gpt->hes; - unsigned i; - -#ifdef DIAGNOSTICS_1 - printf("Point %u\n",pi); - printf(" pflag = %u\n",pflag); - printf(" pi = %u\n",pi); -#endif - - for(i=0;in[0], ns=fd->n[1], nt=fd->n[2]; - unsigned i,j,k, ii=0; - for(p=pt;p!=pe;++p) p->dist2=DBL_MAX; - for(k=0;kz[2][k]; - for(j=0;jz[1][j]; - for(i=0;iz[0][i]; - const double x=fd->x[0][ii], y=fd->x[1][ii], z=fd->x[2][ii]; - ++ii; - for(p=pt;p!=pe;++p) { - const double dx=p->x[0]-x,dy=p->x[1]-y,dz=p->x[2]-z; - const double dist2 = dx*dx+dy*dy+dz*dz; - if(p->dist2<=dist2) continue; - p->dist2=dist2; - p->r[0]=zr, p->r[1]=zs, p->r[2]=zt; - } - } - 
} - } -} - -void findpts_el_3(struct findpts_el_data_3 *const fd, const unsigned npt, - const double tol) -{ - findpt_fun *const fun[4] = - { &findpt_vol, &findpt_face, &findpt_edge, &findpt_pt }; - struct findpts_el_pt_3 *const pbuf = fd->p, *const pstart = fd->p + npt; - unsigned nconv = npt; - unsigned step = 0; - unsigned count[27] = { 0,0,0, 0,0,0, 0,0,0, - 0,0,0, 0,0,0, 0,0,0, - 0,0,0, 0,0,0, 0,0,0 } ; - count[0] = npt; - seed(fd,pbuf,npt); - { unsigned i; - for(i=0;i1 - { unsigned i; - printf("findpts_el_3 Newton step (%u), %u unconverged:\n ", step,nconv); - for(i=0;i<27;++i) printf(" %u",count[i]); - printf("\n"); - } -#endif -#ifdef DIAGNOSTICS_3 - if(step==50) { - unsigned d, i, n=fd->n[0]*fd->n[1]*fd->n[2]; - printf("geometry:\n{\n"); - for(d=0;d<3;++d) { - printf(" {\n"); - for(i=0;ix[d][i],i==n-1?"":","); - printf(" }%s\n",d==3-1?"":","); - } - printf("}\n"); - } -#endif - - for(p=pstart,pout=pbuf; p!=pe; p+=pn,pout+=pn) { - const unsigned pflags = p->flags & FLAG_MASK; - pn = count[pt_flags_to_bin_noC(pflags)]; - fun[num_constrained(pflags)](pout, fd, p,pn, tol); - } - /* group points by contsraints */ - { - unsigned offset[28] = { 0,0,0, 0,0,0, 0,0,0, - 0,0,0, 0,0,0, 0,0,0, - 0,0,0, 0,0,0, 0,0,0, 0 }; - struct findpts_el_pt_3 *const ppe = pbuf+nconv; - for(pout=pbuf; pout!=ppe; ++pout) - ++offset[pt_flags_to_bin(pout->flags & FLAG_MASK)]; - { - unsigned i; unsigned sum=0; - for(i=0;i<27;++i) { - unsigned ci=offset[i]; count[i]=ci, offset[i]=sum, sum+=ci; - } - nconv = offset[27] = sum; /* last bin is converged; forget it */ - } - for(pout=pbuf; pout!=pe; ++pout) - pstart[offset[pt_flags_to_bin(pout->flags & FLAG_MASK)]++] = *pout; - } - } - { struct findpts_el_pt_3 *p, *const pe=pstart+npt; - for(p=pstart;p!=pe;++p) - pbuf[p->index]=*p, pbuf[p->index].flags&=FLAG_MASK; - } -#if DIAGNOSTICS_ITERATIONS - printf("findpts_el_3 took %u steps\n ", step); -#endif -} - -void findpts_el_eval_3( - double *const out_base, const unsigned out_stride, - const 
double *const r_base, const unsigned r_stride, const unsigned pn, - const double *const in, struct findpts_el_data_3 *const fd) -{ - const unsigned nr=fd->n[0],ns=fd->n[1],nt=fd->n[2], - nrs=nr*ns; - double *const wtrs = fd->work, *const wtt = wtrs+(nr+ns)*pn, - *const slice = wtt+nt*pn, *const temp = slice + pn*nrs; - unsigned i; const double *r; double *out; - for(i=0,r=r_base;ilag[0](wtrs+i*(nr+ns) , fd->lag_data[0], nr, 0, r[0]); - fd->lag[1](wtrs+i*(nr+ns)+nr, fd->lag_data[1], ns, 0, r[1]); - fd->lag[2](wtt +i*nt , fd->lag_data[2], nt, 0, r[2]); - r = (const double*)((const char*)r + r_stride); - } - - tensor_mxm(slice,nrs, in,nt, wtt,pn); - for(i=0,out=out_base;ihash_n; - return ( WHEN_3D( hash_index_aux(p->bnd[2].min,p->fac[2],n,x[2]) *n ) - +hash_index_aux(p->bnd[1].min,p->fac[1],n,x[1]) )*n - +hash_index_aux(p->bnd[0].min,p->fac[0],n,x[0]); -} - -static void hash_setfac(struct hash_data *p, const ulong n) -{ - unsigned d; - p->hash_n = n; - for(d=0;dfac[d] = n/(p->bnd[d].max-p->bnd[d].min); -} - -static struct ulong_range hash_range(const struct hash_data *p, unsigned d, - const struct dbl_range r) -{ - struct ulong_range ir; - const slong i0 = lfloor( (r.min - p->bnd[d].min) * p->fac[d] ); - const ulong i1 = lceil ( (r.max - p->bnd[d].min) * p->fac[d] ); - ir.min = i0<0 ? 0 : i0; - ir.max = i1hash_n ? 
i1 : p->hash_n; - if(ir.max==ir.min) ++ir.max; - return ir; -} - -static void hash_bb(struct hash_data *p, const struct local_hash_data *lp, - const struct comm *comm, uint hash_size) -{ - double x[D], buf[D], ghs; - unsigned d; - for(d=0;dbnd[d].min; - comm_allreduce(comm,gs_double,gs_min,x,D,buf); - for(d=0;dbnd[d].min=x[d]; - - for(d=0;dbnd[d].max; - comm_allreduce(comm,gs_double,gs_max,x,D,buf); - for(d=0;dbnd[d].max=x[d]; - - ghs = hash_size; comm_allreduce(comm,gs_double,gs_add,&ghs,1,buf); - hash_setfac(p,lceil(pow(ghs,1./D))); - - #ifdef DIAGNOSTICS - if(comm->id==0) { - printf("global bounding box (%g^%u):\n",(double)p->hash_n,D); - for(d=0;dbnd[d].min,p->bnd[d].max); - } - #endif -} - -static void set_local_mask(unsigned char *const local_mask, - const ulong local_base[D], const uint local_n[D], - const struct hash_data *const p, - const struct obbox *const obb, const uint nel - ) -{ - uint el; - for(el=0;elptr; - #define FOR_LOOP() do { uint bit=0,i,j; WHEN_3D(uint k;) \ - WHEN_3D(for(k=0;kproc = hi%np, hp->index = hi/np; \ - ++hp; \ - } \ - } while(0) - FOR_LOOP(); - #undef FOR_LOOP -} - -static void table_from_hash(struct hash_data *const p, - struct array *const hash, - const uint np, buffer *buf) -{ - const ulong hn = p->hash_n; - ulong hnd; - uint ncell, *offset, i, next_cell; - const struct proc_index *const hp = hash->ptr; - const uint n = hash->n; - hnd = hn*hn; WHEN_3D(hnd*=hn); - ncell = (hnd-1)/np+1; - p->offset = offset = tmalloc(uint,ncell+1+n); - sarray_sort(struct proc_index,hash->ptr,n, index,0, buf); - next_cell = 0; - for(i=0;icomm,hash_size); - for(d=0;dbnd[d]); - local_base[d]=rng.min; - local_n[d]=rng.max-rng.min; - local_ntot*=local_n[d]; - #ifdef DIAGNOSTICS - if(cr->comm.id==0) { - printf("local_range %u: %lu to %lu\n", - d,(unsigned long)rng.min,(unsigned long)rng.max); - } - #endif - } - local_mask = tcalloc(unsigned char, (local_ntot+CHAR_BIT-1)/CHAR_BIT); - set_local_mask(local_mask,local_base,local_n,p,obb,nel); - 
nc=count_bits(local_mask,(local_ntot+CHAR_BIT-1)/CHAR_BIT); - #ifdef DIAGNOSTICS - printf("findpts_hash(%u): local cells : %u / %u\n",cr->comm.id,nc,local_ntot); - #endif - array_init(struct proc_index,&hash,nc), hash.n=nc; - fill_hash(&hash,local_mask,local_base,local_n,p->hash_n,cr->comm.np); - free(local_mask); - sarray_transfer(struct proc_index,&hash,proc,1,cr); - table_from_hash(p,&hash,cr->comm.np,&cr->data); - array_free(&hash); -} - -static void hash_free(struct hash_data *p) { free(p->offset); } - -struct findpts_data { - struct crystal cr; - struct findpts_local_data local; - struct hash_data hash; -}; - -static void setup_aux( - struct findpts_data *const fd, - const double *const elx[D], - const unsigned n[D], const uint nel, - const unsigned m[D], const double bbox_tol, - const uint local_hash_size, const uint global_hash_size, - const unsigned npt_max, const double newt_tol) -{ - findpts_local_setup(&fd->local,elx,n,nel,m,bbox_tol,local_hash_size, - npt_max, newt_tol); - hash_build(&fd->hash,&fd->local.hd,fd->local.obb,nel, - global_hash_size,&fd->cr); -} - -struct findpts_data *findpts_setup( - const struct comm *const comm, - const double *const elx[D], - const unsigned n[D], const uint nel, - const unsigned m[D], const double bbox_tol, - const uint local_hash_size, const uint global_hash_size, - const unsigned npt_max, const double newt_tol) -{ - struct findpts_data *const fd = tmalloc(struct findpts_data, 1); - crystal_init(&fd->cr,comm); - setup_aux(fd,elx,n,nel,m,bbox_tol, - local_hash_size,global_hash_size,npt_max,newt_tol); - return fd; -} - -void findpts_free(struct findpts_data *fd) -{ - hash_free(&fd->hash); - findpts_local_free(&fd->local); - crystal_free(&fd->cr); - free(fd); -} - -struct src_pt { double x[D]; uint index, proc; }; -struct out_pt { double r[D], dist2; uint index, code, el, proc; }; - -void findpts( uint *const code_base , const unsigned code_stride , - uint *const proc_base , const unsigned proc_stride , - uint *const 
el_base , const unsigned el_stride , - double *const r_base , const unsigned r_stride , - double *const dist2_base , const unsigned dist2_stride , - const double *const x_base[D], const unsigned x_stride[D], - const uint npt, struct findpts_data *const fd) -{ - const uint np = fd->cr.comm.np, id=fd->cr.comm.id; - struct array hash_pt, src_pt, out_pt; - /* look locally first */ - if(npt) findpts_local( code_base, code_stride, - el_base, el_stride, - r_base, r_stride, - dist2_base,dist2_stride, - x_base, x_stride, - npt,&fd->local,&fd->cr.data); - /* send unfound and border points to global hash cells */ - { - uint index; - uint *code=code_base, *proc=proc_base; - const double *xp[D]; - struct src_pt *pt; - unsigned d; for(d=0;dhash,x); - unsigned dd; - for(dd=0;ddx[dd]=x[dd]; - pt->index=index; - pt->proc=hi%np; - ++pt; - } - for(d=0;dcr); - } - /* look up points in hash cells, route to possible procs */ - { - const uint *const hash_offset = fd->hash.offset; - uint count=0, *proc, *proc_p; - const struct src_pt *p = hash_pt.ptr, *const pe = p+hash_pt.n; - struct src_pt *q; - for(;p!=pe;++p) { - const uint hi = hash_index(&fd->hash,p->x)/np; - const uint i = hash_offset[hi], ie = hash_offset[hi+1]; - count += ie-i; - } - proc_p = proc = tmalloc(uint,count); - array_init(struct src_pt,&src_pt,count), q=src_pt.ptr; - for(p=hash_pt.ptr;p!=pe;++p) { - const uint hi = hash_index(&fd->hash,p->x)/np; - uint i = hash_offset[hi]; const uint ie = hash_offset[hi+1]; - for(;i!=ie;++i) { - const uint pp = hash_offset[i]; - if(pp==p->proc) continue; /* don't send back to source proc */ - *proc_p++ = pp; - *q++ = *p; - } - } - array_free(&hash_pt); - src_pt.n = proc_p-proc; - #ifdef DIAGNOSTICS - printf("(proc %u) hashed; routing %u/%u\n",id,(unsigned)src_pt.n,count); - #endif - sarray_transfer_ext(struct src_pt,&src_pt,proc,sizeof(uint),&fd->cr); - free(proc); - } - /* look for other procs' points, send back */ - { - uint n=src_pt.n; - const struct src_pt *spt; - struct out_pt 
*opt; - array_init(struct out_pt,&out_pt,n), out_pt.n=n; - spt=src_pt.ptr, opt=out_pt.ptr; - for(;n;--n,++spt,++opt) opt->index=spt->index,opt->proc=spt->proc; - spt=src_pt.ptr, opt=out_pt.ptr; - if(src_pt.n) { - const double *spt_x_base[D]; unsigned spt_x_stride[D]; - unsigned d; for(d=0;dlocal,&fd->cr.data); - } - array_free(&src_pt); - /* group by code to eliminate unfound points */ - sarray_sort(struct out_pt,opt,out_pt.n, code,0, &fd->cr.data); - n=out_pt.n; while(n && opt[n-1].code==CODE_NOT_FOUND) --n; - out_pt.n=n; - #ifdef DIAGNOSTICS - printf("(proc %u) sending back %u found points\n",id,(unsigned)out_pt.n); - #endif - sarray_transfer(struct out_pt,&out_pt,proc,1,&fd->cr); - } - /* merge remote results with user data */ - { - #define AT(T,var,i) (T*)((char*)var##_base+(i)*var##_stride) - uint n=out_pt.n; - struct out_pt *opt; - for(opt=out_pt.ptr;n;--n,++opt) { - const uint index = opt->index; - uint *code = AT(uint,code,index); - double *dist2 = AT(double,dist2,index); - if(*code==CODE_INTERNAL) continue; - if(*code==CODE_NOT_FOUND - || opt->code==CODE_INTERNAL - || opt->dist2<*dist2) { - double *r = AT(double,r,index); - uint *el = AT(uint,el,index), *proc = AT(uint,proc,index); - unsigned d; for(d=0;dr[d]; - *dist2 = opt->dist2; - *proc = opt->proc; - *el = opt->el; - *code = opt->code; - } - } - array_free(&out_pt); - #undef AT - } -} - -struct eval_src_pt { double r[D]; uint index, proc, el; }; -struct eval_out_pt { double out; uint index, proc; }; - -void findpts_eval( - double *const out_base, const unsigned out_stride, - const uint *const code_base, const unsigned code_stride, - const uint *const proc_base, const unsigned proc_stride, - const uint *const el_base, const unsigned el_stride, - const double *const r_base, const unsigned r_stride, - const uint npt, - const double *const in, struct findpts_data *const fd) -{ - struct array src, outpt; - /* copy user data, weed out unfound points, send out */ - { - uint index; - const uint 
*code=code_base, *proc=proc_base, *el=el_base; - const double *r=r_base; - struct eval_src_pt *pt; - array_init(struct eval_src_pt, &src, npt), pt=src.ptr; - for(index=0;indexr[d]=r[d]; - pt->index=index; - pt->proc=*proc; - pt->el=*el; - ++pt; - } - r = (const double*)((const char*)r + r_stride); - code = (const uint*)((const char*)code+code_stride); - proc = (const uint*)((const char*)proc+proc_stride); - el = (const uint*)((const char*)el + el_stride); - } - src.n = pt - (struct eval_src_pt*)src.ptr; - sarray_transfer(struct eval_src_pt,&src,proc,1,&fd->cr); - } - /* evaluate points, send back */ - { - uint n=src.n; - const struct eval_src_pt *spt; - struct eval_out_pt *opt; - /* group points by element */ - sarray_sort(struct eval_src_pt,src.ptr,n, el,0, &fd->cr.data); - array_init(struct eval_out_pt,&outpt,n), outpt.n=n; - spt=src.ptr, opt=outpt.ptr; - for(;n;--n,++spt,++opt) opt->index=spt->index,opt->proc=spt->proc; - spt=src.ptr, opt=outpt.ptr; - findpts_local_eval(&opt->out ,sizeof(struct eval_out_pt), - &spt->el ,sizeof(struct eval_src_pt), - spt->r ,sizeof(struct eval_src_pt), - src.n, in,&fd->local); - array_free(&src); - sarray_transfer(struct eval_out_pt,&outpt,proc,1,&fd->cr); - } - /* copy results to user data */ - { - #define AT(T,var,i) (T*)((char*)var##_base+(i)*var##_stride) - uint n=outpt.n; - struct eval_out_pt *opt; - for(opt=outpt.ptr;n;--n,++opt) *AT(double,out,opt->index)=opt->out; - array_free(&outpt); - #undef AT - } -} - -#undef findpts_eval -#undef findpts -#undef findpts_free -#undef findpts_setup -#undef setup_aux -#undef eval_out_pt -#undef eval_src_pt -#undef out_pt -#undef src_pt -#undef findpts_data -#undef findpts_local_eval -#undef findpts_local -#undef findpts_local_free -#undef findpts_local_setup -#undef findpts_local_data - -#undef hash_free -#undef hash_build -#undef table_from_hash -#undef fill_hash -#undef set_local_mask -#undef hash_bb -#undef hash_range -#undef hash_setfac -#undef hash_index -#undef hash_data -#undef 
local_hash_data -#undef obbox diff --git a/3rdParty/gslib/src/findpts_local.c b/3rdParty/gslib/src/findpts_local.c deleted file mode 100644 index ea8096019..000000000 --- a/3rdParty/gslib/src/findpts_local.c +++ /dev/null @@ -1,52 +0,0 @@ -#include - -#include -#include -#include -#include -#include "c99.h" -#include "types.h" -#include "name.h" -#include "fail.h" -#include "mem.h" -#include "obbox.h" -#include "poly.h" -#include "sort.h" -#include "sarray_sort.h" -#include "findpts_el.h" - -struct uint_range { uint min, max; }; -struct index_el { uint index, el; }; - -static struct dbl_range dbl_range_merge(struct dbl_range a, struct dbl_range b) -{ - struct dbl_range m; - m.min = b.minb.max?a.max:b.max; - return m; -} - -static sint ifloor(double x) { return floor(x); } -static sint iceil (double x) { return ceil (x); } - -static uint hash_index_aux(double low, double fac, uint n, double x) -{ - const sint i = ifloor((x-low)*fac); - return i<0 ? 0 : (n-1<(uint)i ? n-1 : (uint)i); -} - -#define CODE_INTERNAL 0 -#define CODE_BORDER 1 -#define CODE_NOT_FOUND 2 - -#define D 2 -#define WHEN_3D(a) -#include "findpts_local_imp.h" -#undef WHEN_3D -#undef D - -#define D 3 -#define WHEN_3D(a) a -#include "findpts_local_imp.h" -#undef WHEN_3D -#undef D diff --git a/3rdParty/gslib/src/findpts_local.h b/3rdParty/gslib/src/findpts_local.h deleted file mode 100644 index 88c42d33a..000000000 --- a/3rdParty/gslib/src/findpts_local.h +++ /dev/null @@ -1,96 +0,0 @@ -#ifndef FINDPTS_LOCAL_H -#define FINDPTS_LOCAL_H - -#if !defined(MEM_H) || !defined(FINDPTS_EL_H) || !defined(OBBOX_H) -#warning "findpts_local.h" requires "mem.h", "findpts_el.h", "obbox.h" -#endif - -#define findpts_local_setup_2 PREFIXED_NAME(findpts_local_setup_2) -#define findpts_local_free_2 PREFIXED_NAME(findpts_local_free_2 ) -#define findpts_local_2 PREFIXED_NAME(findpts_local_2 ) -#define findpts_local_eval_2 PREFIXED_NAME(findpts_local_eval_2 ) - -struct findpts_local_hash_data_2 { - uint hash_n; - struct 
dbl_range bnd[2]; - double fac[2]; - uint *offset; - uint max; -}; - -struct findpts_local_data_2 { - unsigned ntot; - const double *elx[2]; - struct obbox_2 *obb; - struct findpts_local_hash_data_2 hd; - struct findpts_el_data_2 fed; - double tol; -}; - -void findpts_local_setup_2(struct findpts_local_data_2 *const fd, - const double *const elx[2], - const unsigned n[2], const uint nel, - const unsigned m[2], const double bbox_tol, - const uint max_hash_size, - const unsigned npt_max, const double newt_tol); -void findpts_local_free_2(struct findpts_local_data_2 *const fd); -void findpts_local_2( - uint *const code_base , const unsigned code_stride , - uint *const el_base , const unsigned el_stride , - double *const r_base , const unsigned r_stride , - double *const dist2_base , const unsigned dist2_stride , - const double *const x_base[2], const unsigned x_stride[2], - const uint npt, struct findpts_local_data_2 *const fd, - buffer *buf); -void findpts_local_eval_2( - double *const out_base, const unsigned out_stride, - const uint *const el_base, const unsigned el_stride, - const double *const r_base, const unsigned r_stride, - const uint npt, - const double *const in, struct findpts_local_data_2 *const fd); - -#define findpts_local_setup_3 PREFIXED_NAME(findpts_local_setup_3) -#define findpts_local_free_3 PREFIXED_NAME(findpts_local_free_3 ) -#define findpts_local_3 PREFIXED_NAME(findpts_local_3 ) -#define findpts_local_eval_3 PREFIXED_NAME(findpts_local_eval_3 ) - -struct findpts_local_hash_data_3 { - uint hash_n; - struct dbl_range bnd[3]; - double fac[3]; - uint *offset; - uint max; -}; - -struct findpts_local_data_3 { - unsigned ntot; - const double *elx[3]; - struct obbox_3 *obb; - struct findpts_local_hash_data_3 hd; - struct findpts_el_data_3 fed; - double tol; -}; - -void findpts_local_setup_3(struct findpts_local_data_3 *const fd, - const double *const elx[3], - const unsigned n[3], const uint nel, - const unsigned m[3], const double bbox_tol, - const 
uint max_hash_size, - const unsigned npt_max, const double newt_tol); -void findpts_local_free_3(struct findpts_local_data_3 *const fd); -void findpts_local_3( - uint *const code_base , const unsigned code_stride , - uint *const el_base , const unsigned el_stride , - double *const r_base , const unsigned r_stride , - double *const dist2_base , const unsigned dist2_stride , - const double *const x_base[3], const unsigned x_stride[3], - const uint npt, struct findpts_local_data_3 *const fd, - buffer *buf); -void findpts_local_eval_3( - double *const out_base, const unsigned out_stride, - const uint *const el_base, const unsigned el_stride, - const double *const r_base, const unsigned r_stride, - const uint npt, - const double *const in, struct findpts_local_data_3 *const fd); - -#endif diff --git a/3rdParty/gslib/src/findpts_local_imp.h b/3rdParty/gslib/src/findpts_local_imp.h deleted file mode 100644 index e1e77427e..000000000 --- a/3rdParty/gslib/src/findpts_local_imp.h +++ /dev/null @@ -1,388 +0,0 @@ - -#define obbox TOKEN_PASTE(obbox_ ,D) -#define obbox_calc TOKEN_PASTE(PREFIXED_NAME(obbox_calc_),D) -#define obbox_test TOKEN_PASTE(obbox_test_ ,D) -#define hash_data TOKEN_PASTE(findpts_local_hash_data_,D) -#define hash_index TOKEN_PASTE(hash_index_ ,D) -#define hash_setfac TOKEN_PASTE(hash_setfac_ ,D) -#define hash_range TOKEN_PASTE(hash_range_ ,D) -#define hash_count TOKEN_PASTE(hash_count_ ,D) -#define hash_opt_size TOKEN_PASTE(hash_opt_size_ ,D) -#define hash_bb TOKEN_PASTE(hash_bb_ ,D) -#define hash_build TOKEN_PASTE(hash_build_ ,D) -#define hash_free TOKEN_PASTE(hash_free_ ,D) -#define findpts_el_data TOKEN_PASTE(findpts_el_data_ ,D) -#define findpts_el_pt TOKEN_PASTE(findpts_el_pt_ ,D) -#define findpts_el_setup TOKEN_PASTE(PREFIXED_NAME(findpts_el_setup_),D) -#define findpts_el_free TOKEN_PASTE(PREFIXED_NAME(findpts_el_free_ ),D) -#define findpts_el TOKEN_PASTE(PREFIXED_NAME(findpts_el_ ),D) -#define findpts_el_eval TOKEN_PASTE(PREFIXED_NAME(findpts_el_eval_ 
),D) -#define findpts_el_start TOKEN_PASTE(findpts_el_start_ ,D) -#define findpts_el_points TOKEN_PASTE(findpts_el_points_ ,D) -#define findpts_local_data TOKEN_PASTE(findpts_local_data_,D) -#define map_points_to_els TOKEN_PASTE(map_points_to_els_ ,D) -#define findpts_local_setup TOKEN_PASTE(PREFIXED_NAME(findpts_local_setup_),D) -#define findpts_local_free TOKEN_PASTE(PREFIXED_NAME(findpts_local_free_ ),D) -#define findpts_local TOKEN_PASTE(PREFIXED_NAME(findpts_local_ ),D) -#define findpts_local_eval TOKEN_PASTE(PREFIXED_NAME(findpts_local_eval_ ),D) - -/*-------------------------------------------------------------------------- - Point to Possible Elements Hashing - - Initializing the data: - uint nel; // number of elements - uint max_size = nr*ns*nt*nel; // maximum size of hash table - struct obbox *obb = ...; // bounding boxes for elements - - hash_data data; - hash_build(&data, obb, nel, max_size); - - Using the data: - double x[3]; // point to find - - uint index = hash_index_3(&data, x); - uint i, b = data.offset[index], e = data.offset[index+1]; - - // point may be in elements - // data.offset[b], data.offset[b+1], ... , data.offset[e-1] - // - // list has maximum size data.max (e.g., e-b <= data.max) - - for(i=b; i!=e; ++i) { - uint el = data.offset[i]; - ... 
- } - - When done: - hash_free(&data); - - --------------------------------------------------------------------------*/ - -struct hash_data { - uint hash_n; - struct dbl_range bnd[D]; - double fac[D]; - uint *offset; - uint max; -}; - -static uint hash_index(const struct hash_data *p, const double x[D]) -{ - const uint n = p->hash_n; - return ( WHEN_3D( hash_index_aux(p->bnd[2].min,p->fac[2],n,x[2]) *n ) - +hash_index_aux(p->bnd[1].min,p->fac[1],n,x[1]) )*n - +hash_index_aux(p->bnd[0].min,p->fac[0],n,x[0]); -} - -static void hash_setfac(struct hash_data *p, const uint n) -{ - unsigned d; - p->hash_n = n; - for(d=0;dfac[d] = n/(p->bnd[d].max-p->bnd[d].min); -} - -static struct uint_range hash_range(const struct hash_data *p, unsigned d, - const struct dbl_range r) -{ - struct uint_range ir; - const sint i0 = ifloor( (r.min - p->bnd[d].min) * p->fac[d] ); - const uint i1 = iceil ( (r.max - p->bnd[d].min) * p->fac[d] ); - ir.min = i0<0 ? 0 : i0; - ir.max = i1hash_n ? i1 : p->hash_n; - if(ir.max==ir.min) ++ir.max; - return ir; -} - -static uint hash_count(struct hash_data *p, - const struct obbox *const obb, const uint nel, - const uint n) -{ - uint i,count=0; - hash_setfac(p,n); - for(i=0;i1) { - uint nm = nl+(nu-nl)/2, nmd = nm*nm, size; - WHEN_3D(nmd *= nm); - size = nmd+1+hash_count(p,obb,nel,nm); - if(size<=max_size) nl=nm,size_low=size; else nu=nm; - } - hash_setfac(p,nl); - return size_low; -} - -static void hash_bb(struct hash_data *p, - const struct obbox *const obb, const uint nel) -{ - uint el; unsigned d; - struct dbl_range bnd[D]; - if(nel) { - for(d=0;dbnd[d]=bnd[d]; - } else { - for(d=0;dbnd[d].max=p->bnd[d].min=0; - } -} - -static void hash_build(struct hash_data *p, - const struct obbox *const obb, const uint nel, - const uint max_size) -{ - uint i,el,size,hn,hnd,sum,max, *count; - hash_bb(p,obb,nel); - size = hash_opt_size(p,obb,nel,max_size); - p->offset = tmalloc(uint,size); - hn = p->hash_n; - hnd = hn*hn; WHEN_3D(hnd*=hn); - count = 
tcalloc(uint,hnd); - for(el=0;eloffset[0]=sum; - for(i=0;imax?count[i]:max; - sum += count[i]; - p->offset[i+1] = sum; - } - p->max = max; - for(el=0;eloffset[p->offset[index+1]-count[index]]=el; \ - --count[index]; \ - } \ - } while(0) - FOR_LOOP(); - #undef FOR_LOOP - } - free(count); -} - -static void hash_free(struct hash_data *p) { free(p->offset); } - -struct findpts_local_data { - unsigned ntot; - const double *elx[D]; - struct obbox *obb; - struct hash_data hd; - struct findpts_el_data fed; - double tol; -}; - -void findpts_local_setup(struct findpts_local_data *const fd, - const double *const elx[D], - const unsigned n[D], const uint nel, - const unsigned m[D], const double bbox_tol, - const uint max_hash_size, - const unsigned npt_max, const double newt_tol) -{ - unsigned d; - unsigned ntot=n[0]; for(d=1;dntot = ntot; - for(d=0;delx[d]=elx[d]; - fd->obb=tmalloc(struct obbox,nel); - obbox_calc(fd->obb,elx,n,nel,m,bbox_tol); - hash_build(&fd->hd,fd->obb,nel,max_hash_size); - findpts_el_setup(&fd->fed,n,npt_max); - fd->tol = newt_tol; -} - -void findpts_local_free(struct findpts_local_data *const fd) -{ - findpts_el_free(&fd->fed); - hash_free(&fd->hd); - free(fd->obb); -} - -static void map_points_to_els( - struct array *const map, - uint *const code_base , const unsigned code_stride , - const double *const x_base[D], const unsigned x_stride[D], - const uint npt, const struct findpts_local_data *const fd, - buffer *buf) -{ - uint index; - const double *xp[D]; uint *code=code_base; - unsigned d; for(d=0;d>2)+1); - for(index=0;indexhd,x); - const uint *elp = fd->hd.offset + fd->hd.offset[hi ], - *const ele = fd->hd.offset + fd->hd.offset[hi+1]; - *code = CODE_NOT_FOUND; - for(; elp!=ele; ++elp) { - const uint el = *elp; - if(obbox_test(&fd->obb[el],x)>=0) { - struct index_el *const p = - array_reserve(struct index_el,map,map->n+1); - p[map->n].index = index; - p[map->n].el = el; - ++map->n; - } - } - } - for(d=0;dptr,map->n, el,0, buf); - /* add sentinel */ - 
{ - struct index_el *const p = - array_reserve(struct index_el,map,map->n+1); - p[map->n].el = -(uint)1; - } -} - -#define AT(T,var,i) \ - (T*)( (char*)var##_base +(i)*var##_stride ) -#define CAT(T,var,i) \ - (const T*)((const char*)var##_base +(i)*var##_stride ) -#define CATD(T,var,i,d) \ - (const T*)((const char*)var##_base[d]+(i)*var##_stride[d]) - -void findpts_local( - uint *const code_base , const unsigned code_stride , - uint *const el_base , const unsigned el_stride , - double *const r_base , const unsigned r_stride , - double *const dist2_base , const unsigned dist2_stride , - const double *const x_base[D], const unsigned x_stride[D], - const uint npt, struct findpts_local_data *const fd, - buffer *buf) -{ - struct findpts_el_data *const fed = &fd->fed; - struct findpts_el_pt *const fpt = findpts_el_points(fed); - struct array map; /* point -> element map */ - map_points_to_els(&map, code_base,code_stride, x_base,x_stride, npt, fd, buf); - { - const unsigned npt_max = fd->fed.npt_max; - const struct index_el *p, *const pe = (struct index_el *)map.ptr+map.n; - for(p=map.ptr;p!=pe;) { - const uint el = p->el, el_off=el*fd->ntot; - const double *elx[D]; - unsigned d; - for(d=0;delx[d]+el_off; - findpts_el_start(fed,elx); - do { - const struct index_el *q; - unsigned i; - for(i=0,q=p;iel==el;++q) { - uint *code = AT(uint,code,q->index); - if(*code==CODE_INTERNAL) continue; - for(d=0;dindex,d); - ++i; - } - findpts_el(fed,i,fd->tol); - for(i=0,q=p;iel==el;++q) { - const uint index=q->index; - uint *code = AT(uint,code,index); - double *dist2 = AT(double,dist2,index); - if(*code==CODE_INTERNAL) continue; - if(*code==CODE_NOT_FOUND - || fpt[i].flags==(1u<<(2*D)) /* converged, no constraints */ - || fpt[i].dist2<*dist2) { - double *r = AT(double,r,index); - uint *eli = AT(uint,el,index); - *eli = el; - *code = fpt[i].flags==(1u<<(2*D)) ? 
CODE_INTERNAL : CODE_BORDER; - *dist2 = fpt[i].dist2; - for(d=0;del==el); - } - } - array_free(&map); -} - -/* assumes points are already grouped by elements */ -void findpts_local_eval( - double *const out_base, const unsigned out_stride, - const uint *const el_base, const unsigned el_stride, - const double *const r_base, const unsigned r_stride, - const uint npt, - const double *const in, struct findpts_local_data *const fd) -{ - struct findpts_el_data *const fed = &fd->fed; - const unsigned npt_max = fed->npt_max; - uint p; - for(p=0;pntot; - do { - unsigned i; uint q; - for(i=0,q=p;i -#include -#include - -#define PREC_BITS 256 -#define DIGITS 50 - -#define GLL_LAG_FIX_MAX 24 - -#if 1 -# define STATIC "static " -#else -# define STATIC "" -#endif - - -#define PI 3.1415926535897932384626433832795028841971693993751058209749445923 - -#define DECLARE_1VAR(a) static int init=0; static mpf_t a; \ - if(!init) init=1, mpf_init(a) -#define DECLARE_2VARS(a,b) static int init=0; static mpf_t a,b; \ - if(!init) init=1, mpf_init(a), mpf_init(b) -#define DECLARE_3VARS(a,b,c) static int init=0; static mpf_t a,b,c; \ - if(!init) init=1, mpf_init(a), mpf_init(b), \ - mpf_init(c) -#define DECLARE_4VARS(a,b,c,d) static int init=0; static mpf_t a,b,c,d; \ - if(!init) init=1, mpf_init(a), mpf_init(b), \ - mpf_init(c), mpf_init(d) - -static int is_small(const mpf_t x, const mpf_t y) { - DECLARE_2VARS(xa,ya); - mpf_abs(xa,x); - mpf_abs(ya,y); - mpf_div_2exp(ya,ya,PREC_BITS-mp_bits_per_limb); - return mpf_cmp(xa,ya) < 0; -} - -typedef void fun_3term(mpf_t Pn, int n, const mpf_t x); - -#define DECLARE_THREE_TERM(name, i0_init, init_Ps, a_ip1,a_i,a_im1) \ -static void name(mpf_t Pn, int n, const mpf_t x) \ -{ \ - int i, i0_init; \ - DECLARE_4VARS(a,b,P_im1,P_i); \ - init_Ps; \ - for(i=i0+1; ii0?P_i:P_im1); \ -} - -DECLARE_THREE_TERM(legendre, i0=0,(mpf_set_ui(P_im1,1),mpf_set (P_i,x)), - i+1, 2*i+1, i ) -DECLARE_THREE_TERM(legendre_d1, i0=0,(mpf_set_ui(P_im1,0),mpf_set_ui(P_i,1)), - i , 
2*i+1, i+1) -DECLARE_THREE_TERM(legendre_d2, i0=1,(mpf_set_ui(P_im1,0),mpf_set_ui(P_i,3)), - i-1, 2*i+1, i+2) - -static void newton(mpf_t x, double seed, - fun_3term *fun, fun_3term *der, int n) -{ - DECLARE_3VARS(ox,f,df); - mpf_set_d(x, seed); - do { - mpf_set(ox, x); - fun(f, n,x), der(df, n,x), mpf_div(f, f,df), mpf_sub(x, x,f); - } while(!is_small(f,x)); - fun( f, n,x), der(df, n,x), mpf_div(f, f,df), mpf_sub(x, x,f); -} - -static void gauss_node(mpf_t z, int n, int i) { - if( (n&1) && i==n/2 ) mpf_set_ui(z,0); - else newton(z, cos( (2*n-2*i-1)*(PI/2)/n ), legendre,legendre_d1,n); -} - -static void lobatto_node(mpf_t z, int n, int i) { - if( (n&1) && i==n/2 ) mpf_set_ui(z,0); - else if(i==0) mpf_set_d(z,-(double)1); - else if(i==n-1) mpf_set_ui(z,1); - else newton(z, cos( (n-1-i)*PI/(n-1) ), legendre_d1,legendre_d2,n-1); -} - -#define PRINT_LIST(i, i0,nline,n, printi,sep,sepline) \ - do { \ - int i; \ - for(i=i0;i3) { - printf("static const double gllz_%02d[%2d] = {\n ",n,n/2-1); - for(i=1;i<=n/2-1;++i) { - lobatto_node(z, n,n-1-i); - if(i!=1) printf(",\n "); - gmp_printf("%.*Fg",DIGITS,z); - } - puts("\n};\n"); - } - printf(STATIC "void gll_lag_%02d(double *restrict p, double *restrict w,\n" - " unsigned n, int d, double xh)\n{\n",n); - printf(" const double x = xh*2;\n"); - #define PRINT_D(i) do { \ - printf("d%02d=x",i); \ - if(2*i+1==n) printf(" "); \ - else if(i==0) printf("+2 "); \ - else if(i==n-1) printf("-2 "); \ - else if(i=n-2?printf(" %d",n-1-(i)):printf("v1_%02d",i)) - #define PRINT_U2(i) (i<=1 ?printf(" 0"): \ - (i==2 ?printf(" 2"):printf("u2_%02d",i))) - #define PRINT_V2(i) (i>=n-2?printf(" 0"): \ - (i==n-3?printf(" 2"):printf("v2_%02d",i))) - printf("%s",";\n const double "); - PRINT_LIST(i, 1,3,n, - (PRINT_U0(i),putchar('='),PRINT_U0(i-1),printf("*d%02d",i-1)), - ",",",\n "); - printf("%s",";\n const double "); - PRINT_LIST(i, 1,3,n, - (PRINT_V0(n-1-i),putchar('='),printf("d%02d*",n-i),PRINT_V0(n-i)), - ",",",\n "); - printf("%s",";\n "); - 
PRINT_LIST(i, 0,3,n, - (printf("p[%2d]=w[%2d]*",i,i),PRINT_U0(i),putchar('*'), - PRINT_V0(i)),"; ",";\n "); - puts(";\n if(d>0) {"); - if(n>2) { - printf("%s"," const double "); - PRINT_LIST(i, 2,2,n, - (PRINT_U1(i),putchar('='),PRINT_U1(i-1),printf("*d%02d",i-1), - putchar('+'),PRINT_U0(i-1)), - ",",",\n "); - printf("%s",";\n const double "); - PRINT_LIST(i, 2,2,n, - (PRINT_V1(n-1-i),putchar('='),printf("d%02d*",n-i),PRINT_V1(n-i), - putchar('+'),PRINT_V0(n-i)), - ",",",\n "); - puts(";"); - } - for(i=0;i1) {"); - if(n>3) { - printf("%s"," const double "); - PRINT_LIST(i, 3,2,n, - (PRINT_U2(i),putchar('='),PRINT_U2(i-1),printf("*d%02d",i-1), - printf("+2*"),PRINT_U1(i-1)), - ",",",\n "); - printf("%s",";\n const double "); - PRINT_LIST(i, 3,2,n, - (PRINT_V2(n-1-i),putchar('='),printf("d%02d*",n-i),PRINT_V2(n-i), - printf("+2*"),PRINT_V1(n-i)), - ",",",\n "); - puts(";"); - } - if(n<3) for(i=0;i1) - PRINT_U2(i),putchar('*'),PRINT_V0(i); - else printf(" "); - if(i>0 && i -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" - -#define gs_op gs_op_t /* fix conflict with fortran */ - -#include "gs_defs.h" -#include "gs_local.h" -#include "comm.h" -#include "mem.h" -#include "sort.h" -#include "crystal.h" -#include "sarray_sort.h" -#include "sarray_transfer.h" - -#define gs PREFIXED_NAME(gs ) -#define gs_vec PREFIXED_NAME(gs_vec ) -#define gs_many PREFIXED_NAME(gs_many ) -#define igs PREFIXED_NAME(igs ) -#define igs_vec PREFIXED_NAME(igs_vec ) -#define igs_many PREFIXED_NAME(igs_many ) -#define gs_wait PREFIXED_NAME(gs_wait ) -#define gs_setup PREFIXED_NAME(gs_setup ) -#define gs_free PREFIXED_NAME(gs_free ) -#define gs_unique PREFIXED_NAME(gs_unique) -#define gs_hf2c PREFIXED_NAME(gs_hf2c ) - -GS_DEFINE_DOM_SIZES() - -typedef enum { mode_plain, mode_vec, mode_many, - mode_dry_run } gs_mode; - -static buffer static_buffer = null_buffer; - -static void gather_noop( - void *out, const void *in, const unsigned 
vn, - const uint *map, gs_dom dom, gs_op op) -{} - -static void scatter_noop( - void *out, const void *in, const unsigned vn, - const uint *map, gs_dom dom) -{} - -static void init_noop( - void *out, const unsigned vn, - const uint *map, gs_dom dom, gs_op op) -{} - -/*------------------------------------------------------------------------------ - Topology Discovery -------------------------------------------------------------------------------*/ - -struct gs_topology { - ulong total_shared; /* number of globally unique shared ids */ - struct array nz; /* array of nonzero_id's, grouped by id, - sorted by primary index, then flag, then index */ - struct array sh; /* array of shared_id's, arbitrary ordering */ - struct array pr; /* array of primary_shared_id's */ -}; - -static void gs_topology_free(struct gs_topology *top) -{ - array_free(&top->pr); - array_free(&top->sh); - array_free(&top->nz); -} - -/************** Local topology **************/ - -/* nonzero_ids (local part) - - Creates an array of s_nonzeros, one per nonzero in user id array. The - output array is grouped by id. Within each group, non-flagged entries come - first; otherwise the entries within the group are sorted by the index into - the user id array. The first index in each group is the primary index, and - is stored along with each entry. The groups themselves are ordered in - increasing order of the primary index associated with the group (as opposed - to the user id). 
*/ - -struct nonzero_id { - ulong id; uint i, flag, primary; -}; - -static void nonzero_ids(struct array *nz, - const slong *id, const uint n, buffer *buf) -{ - ulong last_id = ULONG_MAX; - uint i, primary = UINT_MAX; - struct nonzero_id *row, *end; - array_init(struct nonzero_id,nz,n), end=row=nz->ptr; - for(i=0;ii = i; - end->id = abs_id; - end->flag = id_i!=abs_id; - ++end; - } - nz->n = end-row; - array_resize(struct nonzero_id,nz,nz->n); - sarray_sort_2(struct nonzero_id,nz->ptr,nz->n, id,1, flag,0, buf); - for(row=nz->ptr,end=row+nz->n;row!=end;++row) { - ulong this_id = row->id; - if(this_id!=last_id) primary = row->i; - row->primary = primary; - last_id = this_id; - } - sarray_sort(struct nonzero_id,nz->ptr,nz->n, primary,0, buf); -} - -/************** Global topology **************/ - -/* construct list of all unique id's on this proc */ -struct unique_id { ulong id; uint work_proc, src_if; }; -static void unique_ids(struct array *un, const struct array *nz, const uint np) -{ - struct unique_id *un_row; - const struct nonzero_id *nz_row, *nz_end; - array_init(struct unique_id,un,nz->n), un_row=un->ptr; - for(nz_row=nz->ptr,nz_end=nz_row+nz->n;nz_row!=nz_end;++nz_row) { - if(nz_row->i != nz_row->primary) continue; - un_row->id = nz_row->id; - un_row->work_proc = nz_row->id%np; - un_row->src_if = nz_row->flag ? ~nz_row->i : nz_row->i; - ++un_row; - } - un->n = un_row - (struct unique_id*)un->ptr; -} - -/* shared_ids (global part) - - Creates an array of shared_id's from an array of nonzero_id's. Each entry - in the output identifies one id shared with one other processor p. - Note: two procs share an id only when at least one of them has it unflagged. - The primary index is i locally and ri remotely. Bit 1 of flags indicates - the local flag, bit 2 indicates the remote flag. The output has no - particular ordering. - - Also creates an array of primary_shared_id's, one for each shared id. 
- This struct includes ord, a global rank of the id (arbitrary, but unique). */ - -#define FLAGS_LOCAL 1 -#define FLAGS_REMOTE 2 - -/* i : local primary index - p : remote proc - ri : remote primary index - bi : buffer index (set and used during pw setup) */ -struct shared_id { - ulong id; uint i, p, ri, bi; unsigned flags; -}; - -struct primary_shared_id { - ulong id, ord; uint i; unsigned flag; -}; - - - -struct shared_id_work { ulong id,ord; uint p1, p2, i1f, i2f; }; -static void shared_ids_aux(struct array *sh, struct array *pr, uint pr_n, - struct array *wa, buffer *buf) -{ - const struct shared_id_work *w, *we; - struct shared_id *s; - struct primary_shared_id *p; - ulong last_id = ULONG_MAX; - /* translate work array to output arrays */ - sarray_sort(struct shared_id_work,wa->ptr,wa->n, id,1, buf); - array_init(struct shared_id,sh,wa->n), sh->n=wa->n, s=sh->ptr; - array_init(struct primary_shared_id,pr,pr_n), p=pr->ptr; - for(w=wa->ptr,we=w+wa->n;w!=we;++w) { - uint i1f = w->i1f, i2f = w->i2f; - uint i1 = ~i1fid=w->id, s->i=i1, s->p=w->p2, s->ri=i2; - s->flags = ((i2f^i2)&FLAGS_REMOTE) | ((i1f^i1)&FLAGS_LOCAL); - ++s; - if(w->id!=last_id) { - last_id=w->id; - p->id=last_id, p->ord=w->ord, p->i=i1, p->flag=(i1f^i1)&FLAGS_LOCAL; - ++p; - } - } - pr->n = p-(struct primary_shared_id*)pr->ptr; - sarray_sort(struct primary_shared_id,pr->ptr,pr->n, i,0, buf); -} - -static ulong shared_ids(struct array *sh, struct array *pr, - const struct array *nz, struct crystal *cr) -{ - struct array un; struct unique_id *un_row, *un_end, *other; - ulong last_id = ULONG_MAX; - ulong ordinal[2], n_shared=0, scan_buf[2]; - struct array wa; struct shared_id_work *w; - uint n_unique; - /* construct list of all unique id's on this proc */ - unique_ids(&un,nz,cr->comm.np); - n_unique = un.n; - /* transfer list to work procs */ - sarray_transfer(struct unique_id,&un, work_proc,1, cr); - /* group by id, put flagged entries after unflagged (within each group) */ - sarray_sort_2(struct 
unique_id,un.ptr,un.n, id,1, src_if,0, &cr->data); - /* count shared id's */ - for(un_row=un.ptr,un_end=un_row+un.n;un_row!=un_end;++un_row) { - ulong id = un_row->id; - if(~un_row->src_ifsrc_if) continue; - if(id==last_id) continue; - other=un_row+1; - if(other!=un_end&&other->id==id) last_id=id, ++n_shared; - } - comm_scan(ordinal, &cr->comm,gs_slong,gs_add, &n_shared,1, scan_buf); - /* there are ordinal[1] globally shared unique ids; - and ordinal[0] of those are seen by work procs of lower rank; - i.e., this work processor sees the range ordinal[0] + (0,n_shared-1) */ - /* construct list of shared ids */ - last_id = ULONG_MAX; - array_init(struct shared_id_work,&wa,un.n), wa.n=0, w=wa.ptr; - for(un_row=un.ptr,un_end=un_row+un.n;un_row!=un_end;++un_row) { - ulong id = un_row->id; - uint p1 = un_row->work_proc, i1f = un_row->src_if; - if(~i1fid==id;++other) { - uint p2 = other->work_proc, i2f = other->src_if; - ulong ord; - if(id!=last_id) last_id=id, ++ordinal[0]; - ord=ordinal[0]-1; - if(wa.n+2>wa.max) - array_reserve(struct shared_id_work,&wa,wa.n+2), - w=(struct shared_id_work*)wa.ptr+wa.n; - w->id=id, w->ord=ord, w->p1=p1, w->p2=p2, w->i1f=i1f, w->i2f=i2f, ++w; - w->id=id, w->ord=ord, w->p1=p2, w->p2=p1, w->i1f=i2f, w->i2f=i1f, ++w; - wa.n+=2; - } - } - /* transfer shared list to source procs */ - sarray_transfer(struct shared_id_work,&wa, p1,0, cr); - /* fill output arrays from work array */ - shared_ids_aux(sh,pr,n_unique,&wa,&cr->data); - array_free(&un); - array_free(&wa); - return ordinal[1]; -} - -static void get_topology(struct gs_topology *top, - const slong *id, uint n, struct crystal *cr) -{ - nonzero_ids(&top->nz,id,n,&cr->data); - top->total_shared = shared_ids(&top->sh,&top->pr, &top->nz,cr); -} - -static void make_topology_unique(struct gs_topology *top, slong *id, - uint pid, buffer *buf) -{ - struct array *const nz=&top->nz, *const sh=&top->sh, *const pr=&top->pr; - struct nonzero_id *pnz; - struct shared_id *pb, *pe, *e, *out; - struct 
primary_shared_id *q; - - /* flag local non-primaries */ - sarray_sort(struct nonzero_id,nz->ptr,nz->n, i,0, buf); - if(id) { - struct nonzero_id *p,*ee; - for(p=nz->ptr,ee=p+nz->n;p!=ee;++p) - if(p->i != p->primary) id[p->i]=-(slong)p->id,p->flag=1; - } else { - struct nonzero_id *p,*ee; - for(p=nz->ptr,ee=p+nz->n;p!=ee;++p) - if(p->i != p->primary) p->flag=1; - } - sarray_sort(struct nonzero_id,nz->ptr,nz->n, primary,0, buf); - - /* assign owner among shared primaries */ - - /* create sentinel with i = -1 */ - array_reserve(struct shared_id,sh,sh->n+1); - ((struct shared_id*)sh->ptr)[sh->n].i = UINT_MAX; - /* in the sorted list of procs sharing a given id, - the owner is chosen to be the j^th unflagged proc, - where j = id mod (length of list) */ - sarray_sort_2(struct shared_id,sh->ptr,sh->n, i,0, p,0, buf); - out=sh->ptr; pnz=top->nz.ptr; - for(pb=sh->ptr,e=pb+sh->n;pb!=e;pb=pe) { - uint i = pb->i, lt=0,gt=0, owner; struct shared_id *p; - while(pnz->i!=i) ++pnz; - /* note: current proc not in list */ - for(pe=pb; pe->i==i && pe->pflags&FLAGS_REMOTE)) ++lt; - for( ; pe->i==i ; ++pe) if(!(pe->flags&FLAGS_REMOTE)) ++gt; - if(!(pb->flags&FLAGS_LOCAL)) { - owner = pb->id%(lt+gt+1); - if(owner==lt) goto make_sh_unique_mine; - if(owner>lt) --owner; - } else - owner = pb->id%(lt+gt); - /* we don't own pb->id */ - if(id) id[i] = -(slong)pb->id; - pnz->flag=1; - /* we only share this id with the owner now; remove the other entries */ - for(p=pb; p!=pe; ++p) if(!(p->flags&FLAGS_REMOTE) && !(owner--)) break; - if(p!=pe) *out=*p, out->flags=FLAGS_LOCAL, ++out; - continue; - make_sh_unique_mine: - /* we own pb->id */ - if(out==pb) { out=pe; for(p=pb; p!=pe; ++p) p->flags=FLAGS_REMOTE; } - else for(p=pb; p!=pe; ++p) *out=*p,out->flags=FLAGS_REMOTE,++out; - } - sh->n = out - ((struct shared_id*)sh->ptr); - - /* set primary_shared_id flags to match */ - ((struct shared_id*)sh->ptr)[sh->n].i = UINT_MAX; - sarray_sort(struct shared_id,sh->ptr,sh->n, id,1, buf); - 
sarray_sort(struct primary_shared_id,pr->ptr,pr->n, id,1, buf); - q=pr->ptr; - for(pb=sh->ptr,e=pb+sh->n;pb!=e;pb=pe) { - uint i=pb->i; - pe=pb; while(pe->i==i) ++pe; - if(q->id!=pb->id) printf("FAIL!!!\n"); - q->flag=pb->flags&FLAGS_LOCAL; - ++q; - } -} - -/*------------------------------------------------------------------------------ - Local setup -------------------------------------------------------------------------------*/ - -/* assumes nz is sorted by primary, then flag, then index */ -static const uint *local_map(const struct array *nz, const int ignore_flagged, - uint *mem_size) -{ - uint *map, *p, count = 1; - const struct nonzero_id *row, *other, *end; -#define DO_COUNT(cond) do \ - for(row=nz->ptr,end=row+nz->n;row!=end;) { \ - ulong row_id = row->id; int any=0; \ - for(other=row+1;other!=end&&other->id==row_id&&cond;++other) \ - any=2, ++count; \ - count+=any, row=other; \ - } while(0) - if(ignore_flagged) DO_COUNT(other->flag==0); else DO_COUNT(1); -#undef DO_COUNT - p = map = tmalloc(uint,count); *mem_size += count*sizeof(uint); -#define DO_SET(cond) do \ - for(row=nz->ptr,end=row+nz->n;row!=end;) { \ - ulong row_id = row->id; int any=0; \ - *p++ = row->i; \ - for(other=row+1;other!=end&&other->id==row_id&&cond;++other) \ - any=1, *p++ = other->i; \ - if(any) *p++ = UINT_MAX; else --p; \ - row=other; \ - } while(0) - if(ignore_flagged) DO_SET(other->flag==0); else DO_SET(1); -#undef DO_SET - *p = UINT_MAX; - return map; -} - -static const uint *flagged_primaries_map(const struct array *nz, uint *mem_size) -{ - uint *map, *p, count=1; - const struct nonzero_id *row, *end; - for(row=nz->ptr,end=row+nz->n;row!=end;++row) - if(row->i==row->primary && row->flag==1) ++count; - p = map = tmalloc(uint,count); *mem_size += count*sizeof(uint); - for(row=nz->ptr,end=row+nz->n;row!=end;++row) - if(row->i==row->primary && row->flag==1) *p++ = row->i; - *p = UINT_MAX; - return map; -} - 
-/*------------------------------------------------------------------------------ - Remote execution and setup -------------------------------------------------------------------------------*/ - -typedef void exec_fun( - void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, - unsigned transpose, const void *execdata, const struct comm *comm, char *buf); -typedef void fin_fun(void *data); - -struct gs_remote { - uint buffer_size, mem_size; - void *data; - exec_fun *exec; - exec_fun *exec_irecv; - exec_fun *exec_isend; - exec_fun *exec_wait; - fin_fun *fin; -}; - -typedef void setup_fun(struct gs_remote *r, struct gs_topology *top, - const struct comm *comm, buffer *buf); - -/*------------------------------------------------------------------------------ - Pairwise Execution -------------------------------------------------------------------------------*/ -struct pw_comm_data { - uint n; /* number of messages */ - uint *p; /* message source/dest proc */ - uint *size; /* size of message */ - uint total; /* sum of message sizes */ -}; - -struct pw_data { - struct pw_comm_data comm[2]; - const uint *map[2]; - comm_req *req; - uint buffer_size; -}; - -static char *pw_exec_recvs(char *buf, const unsigned unit_size, - const struct comm *comm, - const struct pw_comm_data *c, comm_req *req) -{ - const uint *p, *pe, *size=c->size; - for(p=c->p,pe=p+c->n;p!=pe;++p) { - size_t len = *(size++)*unit_size; - comm_irecv(req++,comm,buf,len,*p,*p); - buf += len; - } - return buf; -} - -static char *pw_exec_sends(char *buf, const unsigned unit_size, - const struct comm *comm, - const struct pw_comm_data *c, comm_req *req) -{ - const uint *p, *pe, *size=c->size; - for(p=c->p,pe=p+c->n;p!=pe;++p) { - size_t len = *(size++)*unit_size; - comm_isend(req++,comm,buf,len,*p,comm->id); - buf += len; - } - return buf; -} - -static void pw_exec( - void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, - unsigned transpose, const void *execdata, const struct comm *comm, char *buf) -{ 
- const struct pw_data *pwd = execdata; - static gs_scatter_fun *const scatter_to_buf[] = - { &gs_scatter, &gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop }; - static gs_gather_fun *const gather_from_buf[] = - { &gs_gather, &gs_gather_vec, &gs_gather_vec_to_many, &gather_noop }; - const unsigned recv = 0^transpose, send = 1^transpose; - unsigned unit_size = vn*gs_dom_size[dom]; - char *sendbuf; - /* post receives */ - sendbuf = pw_exec_recvs(buf,unit_size,comm,&pwd->comm[recv],pwd->req); - /* fill send buffer */ - scatter_to_buf[mode](sendbuf,data,vn,pwd->map[send],dom); - /* post sends */ - pw_exec_sends(sendbuf,unit_size,comm,&pwd->comm[send], - &pwd->req[pwd->comm[recv].n]); - comm_wait(pwd->req,pwd->comm[0].n+pwd->comm[1].n); - /* gather using recv buffer */ - gather_from_buf[mode](data,buf,vn,pwd->map[recv],dom,op); -} - -/*------------------------------------------------------------------------------ - Nonblocking Pairwise Execution -------------------------------------------------------------------------------*/ -static void pw_exec_irecv( - void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, - unsigned transpose, const void *execdata, const struct comm *comm, char *buf) -{ - const struct pw_data *pwd = execdata; - // static gs_scatter_fun *const scatter_to_buf[] = - // { &gs_scatter, &gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop }; - // static gs_gather_fun *const gather_from_buf[] = - // { &gs_gather, &gs_gather_vec, &gs_gather_vec_to_many, &gather_noop }; - const unsigned recv = 0^transpose; // send = 1^transpose; - unsigned unit_size = vn*gs_dom_size[dom]; - /* post receives */ - (void) pw_exec_recvs(buf,unit_size,comm,&pwd->comm[recv],pwd->req); -} - -static void pw_exec_isend( - void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, - unsigned transpose, const void *execdata, const struct comm *comm, char *buf) -{ - const struct pw_data *pwd = execdata; - static gs_scatter_fun *const scatter_to_buf[] = - { &gs_scatter, 
&gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop }; - // static gs_gather_fun *const gather_from_buf[] = - // { &gs_gather, &gs_gather_vec, &gs_gather_vec_to_many, &gather_noop }; - const unsigned recv = 0^transpose, send = 1^transpose; - unsigned unit_size = vn*gs_dom_size[dom]; - - /* fill send buffer */ - char *sendbuf = buf+unit_size*pwd->comm[recv].total; - scatter_to_buf[mode](sendbuf,data,vn,pwd->map[send],dom); - /* post sends */ - pw_exec_sends(sendbuf,unit_size,comm,&pwd->comm[send], - &pwd->req[pwd->comm[recv].n]); -} - -static void pw_exec_wait( - void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, - unsigned transpose, const void *execdata, const struct comm *comm, char *buf) -{ - const struct pw_data *pwd = execdata; - // static gs_scatter_fun *const scatter_to_buf[] = - // { &gs_scatter, &gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop }; - static gs_gather_fun *const gather_from_buf[] = - { &gs_gather, &gs_gather_vec, &gs_gather_vec_to_many, &gather_noop }; - const unsigned recv = 0^transpose; // send = 1^transpose; - // unsigned unit_size = vn*gs_dom_size[dom]; - - comm_wait(pwd->req,pwd->comm[0].n+pwd->comm[1].n); - /* gather using recv buffer */ - gather_from_buf[mode](data,buf,vn,pwd->map[recv],dom,op); -} - -/*------------------------------------------------------------------------------ - Pairwise setup -------------------------------------------------------------------------------*/ -static uint pw_comm_setup(struct pw_comm_data *data, struct array *sh, - const unsigned flags_mask, buffer *buf) -{ - uint n=0,count=0, lp=UINT_MAX, mem_size=0; - struct shared_id *s, *se; - /* sort by remote processor and id (a globally consistent ordering) */ - sarray_sort_2(struct shared_id,sh->ptr,sh->n, p,0, id,1, buf); - /* assign index into buffer */ - for(s=sh->ptr,se=s+sh->n;s!=se;++s) { - if(s->flags&flags_mask) { s->bi = UINT_MAX; continue; } - s->bi = count++; - if(s->p!=lp) lp=s->p, ++n; - } - data->n = n; - data->p = 
tmalloc(uint,2*n); mem_size+=2*n*sizeof(uint); - data->size = data->p + n; - data->total = count; - n = 0, lp=UINT_MAX; - for(s=sh->ptr,se=s+sh->n;s!=se;++s) { - if(s->flags&flags_mask) continue; - if(s->p!=lp) { - lp=s->p; - if(n!=0) data->size[n-1] = count; - count=0, data->p[n++]=lp; - } - ++count; - } - if(n!=0) data->size[n-1] = count; - return mem_size; -} - -static void pw_comm_free(struct pw_comm_data *data) { free(data->p); } - -/* assumes that the bi field of sh is set */ -static const uint *pw_map_setup(struct array *sh, buffer *buf, uint *mem_size) -{ - uint count=0, *map, *p; - struct shared_id *s, *se; - sarray_sort(struct shared_id,sh->ptr,sh->n, i,0, buf); - /* calculate map size */ - count=1; - for(s=sh->ptr,se=s+sh->n;s!=se;) { - uint i=s->i; - if(s->bi==UINT_MAX) { ++s; continue; } - count+=3; - for(++s;s!=se&&s->i==i;++s) if(s->bi!=UINT_MAX) ++count; - } - /* write map */ - p = map = tmalloc(uint,count); *mem_size += count*sizeof(uint); - for(s=sh->ptr,se=s+sh->n;s!=se;) { - uint i=s->i; - if(s->bi==UINT_MAX) { ++s; continue; } - *p++ = i, *p++ = s->bi; - for(++s;s!=se&&s->i==i;++s) if(s->bi!=UINT_MAX) *p++ = s->bi; - *p++ = UINT_MAX; - } - *p = UINT_MAX; - return map; -} - -static struct pw_data *pw_setup_aux(struct array *sh, buffer *buf, - uint *mem_size) -{ - struct pw_data *pwd = tmalloc(struct pw_data,1); - *mem_size = sizeof(struct pw_data); - - /* default behavior: receive only remotely unflagged data */ - *mem_size+=pw_comm_setup(&pwd->comm[0],sh, FLAGS_REMOTE, buf); - pwd->map[0] = pw_map_setup(sh, buf, mem_size); - - /* default behavior: send only locally unflagged data */ - *mem_size+=pw_comm_setup(&pwd->comm[1],sh, FLAGS_LOCAL, buf); - pwd->map[1] = pw_map_setup(sh, buf, mem_size); - - pwd->req = tmalloc(comm_req,pwd->comm[0].n+pwd->comm[1].n); - *mem_size += (pwd->comm[0].n+pwd->comm[1].n)*sizeof(comm_req); - pwd->buffer_size = pwd->comm[0].total + pwd->comm[1].total; - return pwd; -} - -static void pw_free(struct pw_data *data) -{ 
- pw_comm_free(&data->comm[0]); - pw_comm_free(&data->comm[1]); - free((uint*)data->map[0]); - free((uint*)data->map[1]); - free(data->req); - free(data); -} - -static void pw_setup(struct gs_remote *r, struct gs_topology *top, - const struct comm *comm, buffer *buf) -{ - struct pw_data *pwd = pw_setup_aux(&top->sh,buf, &r->mem_size); - r->buffer_size = pwd->buffer_size; - r->data = pwd; - r->exec = (exec_fun*)&pw_exec; - r->exec_irecv = (exec_fun*)&pw_exec_irecv; - r->exec_isend = (exec_fun*)&pw_exec_isend; - r->exec_wait = (exec_fun*)&pw_exec_wait; - r->fin = (fin_fun*)&pw_free; -} - -/*------------------------------------------------------------------------------ - Crystal-Router Execution -------------------------------------------------------------------------------*/ -struct cr_stage { - const uint *scatter_map, *gather_map; - uint size_r, size_r1, size_r2; - uint size_sk, size_s, size_total; - uint p1, p2; - unsigned nrecvn; -}; - -struct cr_data { - struct cr_stage *stage[2]; - unsigned nstages; - uint buffer_size, stage_buffer_size; -}; - -static void cr_exec( - void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, - unsigned transpose, const void *execdata, const struct comm *comm, char *buf) -{ - const struct cr_data *crd = execdata; - static gs_scatter_fun *const scatter_user_to_buf[] = - { &gs_scatter, &gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop }; - static gs_scatter_fun *const scatter_buf_to_buf[] = - { &gs_scatter, &gs_scatter_vec, &gs_scatter_vec, &gs_scatter }; - static gs_scatter_fun *const scatter_buf_to_user[] = - { &gs_scatter, &gs_scatter_vec, &gs_scatter_vec_to_many, &scatter_noop }; - static gs_gather_fun *const gather_buf_to_user[] = - { &gs_gather, &gs_gather_vec, &gs_gather_vec_to_many, &gather_noop }; - static gs_gather_fun *const gather_buf_to_buf[] = - { &gs_gather, &gs_gather_vec, &gs_gather_vec, &gs_gather }; - const unsigned unit_size = vn*gs_dom_size[dom], nstages=crd->nstages; - unsigned k; - char *sendbuf, 
*buf_old, *buf_new; - const struct cr_stage *stage = crd->stage[transpose]; - buf_old = buf; - buf_new = buf_old + unit_size*crd->stage_buffer_size; - /* crystal router */ - for(k=0;knp+k); - if(stage[k].nrecvn==2) - comm_irecv(&req[2],comm,buf_new+unit_size*stage[k].size_r1, - unit_size*stage[k].size_r2, stage[k].p2, comm->np+k); - sendbuf = buf_new+unit_size*stage[k].size_r; - if(k==0) - scatter_user_to_buf[mode](sendbuf,data,vn,stage[0].scatter_map,dom); - else - scatter_buf_to_buf[mode](sendbuf,buf_old,vn,stage[k].scatter_map,dom), - gather_buf_to_buf [mode](sendbuf,buf_old,vn,stage[k].gather_map ,dom,op); - - comm_isend(&req[0],comm,sendbuf,unit_size*stage[k].size_s, - stage[k].p1, comm->np+k); - comm_wait(&req[0],1+stage[k].nrecvn); - { char *t = buf_old; buf_old=buf_new; buf_new=t; } - } - scatter_buf_to_user[mode](data,buf_old,vn,stage[k].scatter_map,dom); - gather_buf_to_user [mode](data,buf_old,vn,stage[k].gather_map ,dom,op); -} - -/*------------------------------------------------------------------------------ - Crystal-Router setup -------------------------------------------------------------------------------*/ -static uint cr_schedule(struct cr_data *data, const struct comm *comm) -{ - uint mem_size = 0; - const uint id = comm->id; - uint bl=0, n=comm->np; - unsigned k = 0; - while(n>1) { - uint nl = (n+1)/2, bh = bl+nl; - if(idnstages = k; - data->stage[0] = tmalloc(struct cr_stage,2*(k+1)); - data->stage[1] = data->stage[0] + (k+1); - mem_size += 2*(k+1)*sizeof(struct cr_stage); - bl=0, n=comm->np, k=0; - while(n>1) { - uint nl = (n+1)/2, bh = bl+nl; - uint targ; unsigned recvn; - recvn = 1, targ = n-1-(id-bl)+bl; - if(id==targ) targ=bh, recvn=0; - if(n&1 && id==bh) recvn=2; - data->stage[1][k].nrecvn=data->stage[0][k].nrecvn=recvn; - data->stage[1][k].p1 =data->stage[0][k].p1 =targ; - data->stage[1][k].p2 =data->stage[0][k].p2 =comm->id-1; - if(idmax; - struct crl_id *w = cw->ptr; - struct shared_id *s, *se; - -#define CW_ADD(aid,ap,ari,asi) do { 
\ - if(cw_n==cw_max) \ - array_reserve(struct crl_id,cw,cw_n+1),cw_max=cw->max, \ - w=(struct crl_id*)cw->ptr+cw_n; \ - w->id=aid, w->p=ap, w->ri=ari, w->si=asi; \ - ++w, ++cw_n; \ - } while(0) - - for(s=sh->ptr,se=s+sh->n;s!=se;++s) { - int send = (s->flags&send_mask)==0; - int recv = (s->flags&recv_mask)==0; - if(s->i!=last_i) last_i=s->i, added_myself=0; - if(!added_myself && recv && (s->flags&FLAGS_LOCAL)==0) { - added_myself=1; - CW_ADD(s->id,this_p,s->i,s->i); - } - if(send) CW_ADD(s->id,s->p,s->ri,s->i); - } - cw->n=cw_n; -#undef CW_ADD -} - -static uint crl_maps(struct cr_stage *stage, struct array *cw, buffer *buf) -{ - uint mem_size=0; - struct crl_id *w, *we, *other; - uint scount=1, gcount=1, *sp, *gp; - sarray_sort_2(struct crl_id,cw->ptr,cw->n, bi,0, si,0, buf); - for(w=cw->ptr,we=w+cw->n;w!=we;w=other) { - uint bi=w->bi,any=0,si=w->si; - scount+=3; - for(other=w+1;other!=we&&other->bi==bi;++other) - if(other->si!=si) si=other->si, any=2, ++gcount; - gcount+=any; - } - stage->scatter_map = sp = tmalloc(uint,scount+gcount); - stage->gather_map = gp = sp + scount; - mem_size += (scount+gcount)*sizeof(uint); - for(w=cw->ptr,we=w+cw->n;w!=we;w=other) { - uint bi=w->bi,any=0,si=w->si; - *sp++ = w->si, *sp++ = bi; - *gp++ = bi; - for(other=w+1;other!=we&&other->bi==bi;++other) - if(other->si!=si) si=other->si, any=1, *gp++ = si; - if(any) *gp++ = UINT_MAX; else --gp; - *sp++ = UINT_MAX; - } - *sp=UINT_MAX, *gp=UINT_MAX; - return mem_size; -} - -static uint crl_work_label(struct array *cw, struct cr_stage *stage, - uint cutoff, int send_hi, buffer *buf, - uint *mem_size) -{ - struct crl_id *w, *we, *start; - uint nsend, nkeep = 0, nks = 0, bi=0; - /* here w->send has a reverse meaning */ - if(send_hi) for(w=cw->ptr,we=w+cw->n;w!=we;++w) w->send = w->p< cutoff; - else for(w=cw->ptr,we=w+cw->n;w!=we;++w) w->send = w->p>=cutoff; - sarray_sort_2(struct crl_id,cw->ptr,cw->n, id,1, send,0, buf); - for(start=cw->ptr,w=start,we=w+cw->n;w!=we;++w) { - nkeep += 
w->send; - if(w->id!=start->id) start=w; - if(w->send!=start->send) w->send=0,w->bi=1, ++nks; else w->bi=0; - } - nsend = cw->n-nkeep; - /* assign indices; sent ids have priority (hence w->send is reversed) */ - sarray_sort(struct crl_id,cw->ptr,cw->n, send,0, buf); - for(start=cw->ptr,w=start,we=w+nsend+nks;w!=we;++w) { - if(w->id!=start->id) start=w, ++bi; - if(w->bi!=1) w->send=1; /* switch back to the usual semantics */ - w->bi = bi; - } - stage->size_s = nsend+nks==0 ? 0 : bi+1; - for(we=(struct crl_id*)cw->ptr+cw->n;w!=we;++w) { - if(w->id!=start->id) start=w, ++bi; - w->send = 0; /* switch back to the usual semantics */ - w->bi = bi; - } - stage->size_sk = cw->n==0 ? 0 : bi+1; - *mem_size += crl_maps(stage,cw,buf); - return nsend; -} - -static void crl_bi_to_si(struct crl_id *w, uint n, uint v) { - for(;n;--n) w->si=w->bi+v, ++w; -} - -static void crl_ri_to_bi(struct crl_id *w, uint n) { - for(;n;--n) w->bi=w->ri, ++w; -} - -static uint cr_learn(struct array *cw, struct cr_stage *stage, - const struct comm *comm, buffer *buf, uint *mem_size) -{ - comm_req req[3]; - const uint id = comm->id; - uint bl=0, n=comm->np; - uint size_max=0; - uint tag = comm->np; - while(n>1) { - uint nl = (n+1)/2, bh = bl+nl; - uint nkeep, nsend[2], nrecv[2][2] = {{0,0},{0,0}}; - struct crl_id *wrecv[2], *wsend; - nsend[0] = crl_work_label(cw,stage,bh,idsize_s; - nkeep = cw->n - nsend[0]; - - if(stage->nrecvn ) comm_irecv(&req[1],comm,nrecv[0],2*sizeof(uint), - stage->p1,tag); - if(stage->nrecvn==2) comm_irecv(&req[2],comm,nrecv[1],2*sizeof(uint), - stage->p2,tag); - comm_isend(&req[0],comm,nsend,2*sizeof(uint),stage->p1,tag); - comm_wait(req,1+stage->nrecvn),++tag; - - stage->size_r1 = nrecv[0][1], stage->size_r2 = nrecv[1][1]; - stage->size_r = stage->size_r1 + stage->size_r2; - stage->size_total = stage->size_r + stage->size_sk; - if(stage->size_total>size_max) size_max=stage->size_total; - - array_reserve(struct crl_id,cw,cw->n+nrecv[0][0]+nrecv[1][0]); - wrecv[0] = cw->ptr, 
wrecv[0] += cw->n, wrecv[1] = wrecv[0]+nrecv[0][0]; - wsend = cw->ptr, wsend += nkeep; - if(stage->nrecvn ) - comm_irecv(&req[1],comm,wrecv[0],nrecv[0][0]*sizeof(struct crl_id), - stage->p1,tag); - if(stage->nrecvn==2) - comm_irecv(&req[2],comm,wrecv[1],nrecv[1][0]*sizeof(struct crl_id), - stage->p2,tag); - sarray_sort_2(struct crl_id,cw->ptr,cw->n, send,0, bi,0, buf); - comm_isend(&req[0],comm,wsend,nsend[0]*sizeof(struct crl_id),stage->p1,tag); - comm_wait(req,1+stage->nrecvn),++tag; - - crl_bi_to_si(cw->ptr,nkeep,stage->size_r); - if(stage->nrecvn) crl_bi_to_si(wrecv[0],nrecv[0][0],0); - if(stage->nrecvn==2) crl_bi_to_si(wrecv[1],nrecv[1][0],stage->size_r1); - memmove(wsend,wrecv[0],(nrecv[0][0]+nrecv[1][0])*sizeof(struct crl_id)); - cw->n += nrecv[0][0] + nrecv[1][0]; - cw->n -= nsend[0]; - - if(idptr,cw->n); - *mem_size += crl_maps(stage,cw,buf); - return size_max; -} - -static struct cr_data *cr_setup_aux( - struct array *sh, const struct comm *comm, buffer *buf, uint *mem_size) -{ - uint size_max[2]; - struct array cw = null_array; - struct cr_data *crd = tmalloc(struct cr_data,1); - *mem_size = sizeof(struct cr_data); - - /* default behavior: receive only remotely unflagged data */ - /* default behavior: send only locally unflagged data */ - - *mem_size += cr_schedule(crd,comm); - - sarray_sort(struct shared_id,sh->ptr,sh->n, i,0, buf); - crl_work_init(&cw,sh, FLAGS_LOCAL , comm->id); - size_max[0]=cr_learn(&cw,crd->stage[0],comm,buf, mem_size); - crl_work_init(&cw,sh, FLAGS_REMOTE, comm->id); - size_max[1]=cr_learn(&cw,crd->stage[1],comm,buf, mem_size); - - crd->stage_buffer_size = size_max[1]>size_max[0]?size_max[1]:size_max[0]; - - array_free(&cw); - - crd->buffer_size = 2*crd->stage_buffer_size; - return crd; -} - -static void cr_free_stage_maps(struct cr_stage *stage, unsigned kmax) -{ - unsigned k; - for(k=0; kscatter_map); - ++stage; - } - free((uint*)stage->scatter_map); -} - -static void cr_free(struct cr_data *data) -{ - 
cr_free_stage_maps(data->stage[0],data->nstages); - cr_free_stage_maps(data->stage[1],data->nstages); - free(data->stage[0]); - free(data); -} - -static void cr_setup(struct gs_remote *r, struct gs_topology *top, - const struct comm *comm, buffer *buf) -{ - struct cr_data *crd = cr_setup_aux(&top->sh,comm,buf, &r->mem_size); - r->buffer_size = crd->buffer_size; - r->data = crd; - r->exec = (exec_fun*)&cr_exec; - r->fin = (fin_fun*)&cr_free; -} - -/*------------------------------------------------------------------------------ - All-reduce Execution -------------------------------------------------------------------------------*/ -struct allreduce_data { - const uint *map_to_buf[2], *map_from_buf[2]; - uint buffer_size; - comm_req *req; -}; - -static void allreduce_exec( - void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, - unsigned transpose, const void *execdata, const struct comm *comm, char *buf) -{ - const struct allreduce_data *ard = execdata; - static gs_scatter_fun *const scatter_to_buf[] = - { &gs_scatter, &gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop }; - static gs_scatter_fun *const scatter_from_buf[] = - { &gs_scatter, &gs_scatter_vec, &gs_scatter_vec_to_many, &scatter_noop }; - uint gvn = vn*(ard->buffer_size/2); - unsigned unit_size = gs_dom_size[dom]; - char *ardbuf; - ardbuf = buf+unit_size*gvn; - /* user array -> buffer */ - gs_init_array(buf,gvn,dom,op); - scatter_to_buf[mode](buf,data,vn,ard->map_to_buf[transpose],dom); - /* all reduce */ - comm_allreduce(comm,dom,op, buf,gvn, ardbuf); - /* buffer -> user array */ - scatter_from_buf[mode](data,buf,vn,ard->map_from_buf[transpose],dom); -} - -/*------------------------------------------------------------------------------ - Nonblocking All-reduce Execution -------------------------------------------------------------------------------*/ -static void allreduce_exec_i( - void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, - unsigned transpose, const void *execdata, const 
struct comm *comm, char *buf) -{ - const struct allreduce_data *ard = execdata; - static gs_scatter_fun *const scatter_to_buf[] = - { &gs_scatter, &gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop }; - // static gs_scatter_fun *const scatter_from_buf[] = - // { &gs_scatter, &gs_scatter_vec, &gs_scatter_vec_to_many, &scatter_noop }; - uint gvn = vn*(ard->buffer_size/2); - unsigned unit_size = gs_dom_size[dom]; - char *ardbuf = buf+unit_size*gvn; - - /* user array -> buffer */ - gs_init_array(buf,gvn,dom,op); - scatter_to_buf[mode](buf,data,vn,ard->map_to_buf[transpose],dom); - /* all reduce */ - comm_iallreduce(ard->req,comm,dom,op,buf,gvn,ardbuf); -} - -static void allreduce_exec_wait( - void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, - unsigned transpose, const void *execdata, const struct comm *comm, char *buf) -{ - const struct allreduce_data *ard = execdata; - // static gs_scatter_fun *const scatter_to_buf[] = - // { &gs_scatter, &gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop }; - static gs_scatter_fun *const scatter_from_buf[] = - { &gs_scatter, &gs_scatter_vec, &gs_scatter_vec_to_many, &scatter_noop }; - uint gvn = vn*(ard->buffer_size/2); - unsigned unit_size = gs_dom_size[dom]; - char *ardbuf = buf+unit_size*gvn; - - // Why do I need this? Ugly !!! 
- if (comm->np > 1) - comm_wait(ard->req, 1); -#ifdef MPI - memcpy(buf,ardbuf,gvn*gs_dom_size[dom]); -#endif - /* buffer -> user array */ - scatter_from_buf[mode](data,buf,vn,ard->map_from_buf[transpose],dom); -} - -/*------------------------------------------------------------------------------ - All-reduce setup -------------------------------------------------------------------------------*/ -static const uint *allreduce_map_setup( - struct array *pr, const unsigned flags_mask, int to_buf, uint *mem_size) -{ - struct primary_shared_id *p, *pe; - uint count=1, *map, *m; - for(p=pr->ptr,pe=p+pr->n;p!=pe;++p) - if((p->flag&flags_mask)==0) count+=3; - m=map=tmalloc(uint,count); *mem_size += count*sizeof(uint); - if(to_buf) { - for(p=pr->ptr,pe=p+pr->n;p!=pe;++p) - if((p->flag&flags_mask)==0) - *m++ = p->i, *m++ = p->ord, *m++ = UINT_MAX; - } else { - for(p=pr->ptr,pe=p+pr->n;p!=pe;++p) - if((p->flag&flags_mask)==0) - *m++ = p->ord, *m++ = p->i, *m++ = UINT_MAX; - } - *m=UINT_MAX; - return map; -} - -static struct allreduce_data *allreduce_setup_aux( - struct array *pr, ulong total_shared, uint *mem_size) -{ - struct allreduce_data *ard = tmalloc(struct allreduce_data,1); - *mem_size = sizeof(struct allreduce_data); - - /* default behavior: reduce only unflagged data, copy to all */ - ard->map_to_buf [0] = allreduce_map_setup(pr,1,1, mem_size); - ard->map_from_buf[0] = allreduce_map_setup(pr,0,0, mem_size); - - /* transpose behavior: reduce all data, copy to unflagged */ - ard->map_to_buf [1] = allreduce_map_setup(pr,0,1, mem_size); - ard->map_from_buf[1] = allreduce_map_setup(pr,1,0, mem_size); - - ard->req = tmalloc(comm_req, 1); - - ard->buffer_size = total_shared*2; - return ard; -} - -static void allreduce_free(struct allreduce_data *ard) -{ - free((uint*)ard->map_to_buf[0]); - free((uint*)ard->map_to_buf[1]); - free((uint*)ard->map_from_buf[0]); - free((uint*)ard->map_from_buf[1]); - free(ard); -} - -static void allreduce_setup(struct gs_remote *r, struct 
gs_topology *top, - const struct comm *comm, buffer *buf) -{ - struct allreduce_data *ard - = allreduce_setup_aux(&top->pr,top->total_shared, &r->mem_size); - r->buffer_size = ard->buffer_size; - r->data = ard; - r->exec = (exec_fun*)&allreduce_exec; - r->exec_irecv = (exec_fun*)&allreduce_exec_i; - r->exec_isend = NULL; - r->exec_wait = (exec_fun*)&allreduce_exec_wait; - r->fin = (fin_fun*)&allreduce_free; -} - -/*------------------------------------------------------------------------------ - Automatic Setup --- dynamically picks the fastest method -------------------------------------------------------------------------------*/ - -static void dry_run_time(double times[3], const struct gs_remote *r, - const struct comm *comm, buffer *buf) -{ - int i; double t; - buffer_reserve(buf,gs_dom_size[gs_double]*r->buffer_size); - for(i= 2;i;--i) - r->exec(0,mode_dry_run,1,gs_double,gs_add,0,r->data,comm,buf->ptr); - comm_barrier(comm); - t = comm_time(); - for(i=10;i;--i) - r->exec(0,mode_dry_run,1,gs_double,gs_add,0,r->data,comm,buf->ptr); - t = (comm_time() - t)/10; - times[0] = t/comm->np, times[1] = t, times[2] = t; - comm_allreduce(comm,gs_double,gs_add, ×[0],1, &t); - comm_allreduce(comm,gs_double,gs_min, ×[1],1, &t); - comm_allreduce(comm,gs_double,gs_max, ×[2],1, &t); -} - -static void auto_setup(struct gs_remote *r, struct gs_topology *top, - const struct comm *comm, buffer *buf) -{ - pw_setup(r, top,comm,buf); - - if(comm->np>1) { - // const char *name = "pairwise"; - struct gs_remote r_alt; - double time[2][3]; - -#if 0 - #define DRY_RUN(i,gsr,str) do { \ - if(comm->id==0) printf(" " str ": "); \ - dry_run_time(time[i],gsr,comm,buf); \ - if(comm->id==0) \ - printf("%g %g %g\n",time[i][0],time[i][1],time[i][2]); \ - } while(0) -#endif - - #define DRY_RUN(i,gsr,str) do { \ - dry_run_time(time[i],gsr,comm,buf); \ - } while(0) - - #define DRY_RUN_CHECK(str,new_name) do { \ - DRY_RUN(1,&r_alt,str); \ - if(time[1][2]fin(r->data), *r = r_alt; \ - else \ - 
r_alt.fin(r_alt.data); \ - } while(0) - - DRY_RUN(0, r, "pairwise times (avg, min, max)"); - - cr_setup(&r_alt, top,comm,buf); - DRY_RUN_CHECK( "crystal router ", "crystal router"); - - if(top->total_shared<100000) { - allreduce_setup(&r_alt, top,comm,buf); - DRY_RUN_CHECK( "all reduce ", "allreduce"); - } - - #undef DRY_RUN_CHECK - #undef DRY_RUN - - // if(comm->id==0) printf(" used all_to_all method: %s\n",name); - } -} - -/*------------------------------------------------------------------------------ - Main Execution -------------------------------------------------------------------------------*/ -struct gs_data { - struct comm comm; - const uint *map_local[2]; /* 0=unflagged, 1=all */ - const uint *flagged_primaries; - struct gs_remote r; - uint handle_size; -}; - -/*------------------------------------------------------------------------------ - GS_AUX - blocking and non-blocking -------------------------------------------------------------------------------*/ -static void gs_aux( - void *u, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose, - struct gs_data *gsh, buffer *buf) -{ - static gs_scatter_fun *const local_scatter[] = - { &gs_scatter, &gs_scatter_vec, &gs_scatter_many, &scatter_noop }; - static gs_gather_fun *const local_gather [] = - { &gs_gather, &gs_gather_vec, &gs_gather_many, &gather_noop }; - static gs_init_fun *const init[] = - { &gs_init, &gs_init_vec, &gs_init_many, &init_noop }; - if(!buf) buf = &static_buffer; - buffer_reserve(buf,vn*gs_dom_size[dom]*gsh->r.buffer_size); - local_gather [mode](u,u,vn,gsh->map_local[0^transpose],dom,op); - if(transpose==0) init[mode](u,vn,gsh->flagged_primaries,dom,op); - gsh->r.exec(u,mode,vn,dom,op,transpose,gsh->r.data,&gsh->comm,buf->ptr); - local_scatter[mode](u,u,vn,gsh->map_local[1^transpose],dom); -} - -static void gs_aux_irecv( - void *u, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose, - struct gs_data *gsh, buffer *buf) -{ - // static gs_scatter_fun *const 
local_scatter[] = - // { &gs_scatter, &gs_scatter_vec, &gs_scatter_many, &scatter_noop }; - static gs_gather_fun *const local_gather [] = - { &gs_gather, &gs_gather_vec, &gs_gather_many, &gather_noop }; - static gs_init_fun *const init[] = - { &gs_init, &gs_init_vec, &gs_init_many, &init_noop }; - if(!buf) buf = &static_buffer; - buffer_reserve(buf,vn*gs_dom_size[dom]*gsh->r.buffer_size); - local_gather [mode](u,u,vn,gsh->map_local[0^transpose],dom,op); - if(transpose==0) init[mode](u,vn,gsh->flagged_primaries,dom,op); - - if (gsh->r.exec_irecv) - gsh->r.exec_irecv(u,mode,vn,dom,op,transpose,gsh->r.data,&gsh->comm,buf->ptr); -} - -static void gs_aux_isend( - void *u, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose, - struct gs_data *gsh, buffer *buf) -{ - // static gs_scatter_fun *const local_scatter[] = - // { &gs_scatter, &gs_scatter_vec, &gs_scatter_many, &scatter_noop }; - // static gs_gather_fun *const local_gather [] = - // { &gs_gather, &gs_gather_vec, &gs_gather_many, &gather_noop }; - // static gs_init_fun *const init[] = - // { &gs_init, &gs_init_vec, &gs_init_many, &init_noop }; - - if(!buf) buf = &static_buffer; - - if (gsh->r.exec_isend) - gsh->r.exec_isend(u,mode,vn,dom,op,transpose,gsh->r.data,&gsh->comm,buf->ptr); -} - -static void gs_aux_wait( - void *u, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose, - struct gs_data *gsh, buffer *buf) -{ - static gs_scatter_fun *const local_scatter[] = - { &gs_scatter, &gs_scatter_vec, &gs_scatter_many, &scatter_noop }; - // static gs_gather_fun *const local_gather [] = - // { &gs_gather, &gs_gather_vec, &gs_gather_many, &gather_noop }; - // static gs_init_fun *const init[] = - // { &gs_init, &gs_init_vec, &gs_init_many, &init_noop }; - - if(!buf) buf = &static_buffer; - - if (gsh->r.exec_wait) - gsh->r.exec_wait(u,mode,vn,dom,op,transpose,gsh->r.data,&gsh->comm,buf->ptr); - - local_scatter[mode](u,u,vn,gsh->map_local[1^transpose],dom); -} - 
-/*------------------------------------------------------------------------------ - GS interface - blocking and non-blocking -------------------------------------------------------------------------------*/ -struct nonblocking_private { - void *u; - gs_mode mode; - gs_dom dom; - gs_op op; - unsigned transpose; - struct gs_data *gsh; - buffer *buf; - unsigned vn; -}; - -typedef struct nonblocking_private* nblkng; - -static nblkng *nblkng_dict; -static int nblkng_max = 0; -static int nblkng_n = 0; -static int nblkng_count = 0; - -void gs(void *u, gs_dom dom, gs_op op, unsigned transpose, - struct gs_data *gsh, buffer *buf) -{ - gs_aux(u,mode_plain,1,dom,op,transpose,gsh,buf); -} - -void igs(void *u, gs_dom dom, gs_op op, unsigned transpose, - struct gs_data *gsh, buffer *buf, int *handle) -{ - if(nblkng_n==nblkng_max) nblkng_max+=nblkng_max/2+1, - nblkng_dict=trealloc(nblkng,nblkng_dict,nblkng_max); - - nblkng_dict[nblkng_n] = tmalloc(struct nonblocking_private, 1); - - nblkng_dict[nblkng_n]->u = u; - nblkng_dict[nblkng_n]->dom = dom; - nblkng_dict[nblkng_n]->op = op; - nblkng_dict[nblkng_n]->transpose = transpose; - nblkng_dict[nblkng_n]->gsh = gsh; - nblkng_dict[nblkng_n]->buf = buf; - nblkng_dict[nblkng_n]->mode = mode_plain; - nblkng_dict[nblkng_n]->vn = 1; - - *handle = nblkng_n++; - nblkng_count++; - - gs_aux_irecv(u,mode_plain,1,dom,op,transpose,gsh,buf); - gs_aux_isend(u,mode_plain,1,dom,op,transpose,gsh,buf); -} - -void gs_wait(int handle) -{ - if(handle < nblkng_n) { - gs_aux_wait(nblkng_dict[handle]->u, - nblkng_dict[handle]->mode, - nblkng_dict[handle]->vn, - nblkng_dict[handle]->dom, - nblkng_dict[handle]->op, - nblkng_dict[handle]->transpose, - nblkng_dict[handle]->gsh, - nblkng_dict[handle]->buf); - free(nblkng_dict[handle]); - nblkng_dict[handle] = 0; - nblkng_count--; - } - - if(nblkng_count == 0) { - free(nblkng_dict); - nblkng_dict = 0; - nblkng_max = 0; - nblkng_n = 0; - } -} 
-/*------------------------------------------------------------------------------ - GS_VEC interface - blocking and non-blocking -------------------------------------------------------------------------------*/ -void gs_vec(void *u, unsigned vn, gs_dom dom, gs_op op, - unsigned transpose, struct gs_data *gsh, buffer *buf) -{ - gs_aux(u,mode_vec,vn,dom,op,transpose,gsh,buf); -} - -void igs_vec(void *u, unsigned vn, gs_dom dom, gs_op op, - unsigned transpose, struct gs_data *gsh, buffer *buf, int *handle) -{ - if(nblkng_n==nblkng_max) nblkng_max+=nblkng_max/2+1, - nblkng_dict=trealloc(nblkng,nblkng_dict,nblkng_max); - - nblkng_dict[nblkng_n] = tmalloc(struct nonblocking_private, 1); - - nblkng_dict[nblkng_n]->u = u; - nblkng_dict[nblkng_n]->dom = dom; - nblkng_dict[nblkng_n]->op = op; - nblkng_dict[nblkng_n]->transpose = transpose; - nblkng_dict[nblkng_n]->gsh = gsh; - nblkng_dict[nblkng_n]->buf = buf; - nblkng_dict[nblkng_n]->vn = vn; - nblkng_dict[nblkng_n]->mode = mode_vec; - - *handle = nblkng_n++; - nblkng_count++; - - gs_aux_irecv(u,mode_vec,vn,dom,op,transpose,gsh,buf); - gs_aux_isend(u,mode_vec,vn,dom,op,transpose,gsh,buf); -} -/*------------------------------------------------------------------------------ - GS_MANY interface - blocking and non-blocking -------------------------------------------------------------------------------*/ -void gs_many(void *const*u, unsigned vn, gs_dom dom, gs_op op, - unsigned transpose, struct gs_data *gsh, buffer *buf) -{ - gs_aux((void*)u,mode_many,vn,dom,op,transpose,gsh,buf); -} - -void igs_many(void *const*u, unsigned vn, gs_dom dom, gs_op op, - unsigned transpose, struct gs_data *gsh, buffer *buf, int *handle) -{ - if(nblkng_n==nblkng_max) nblkng_max+=nblkng_max/2+1, - nblkng_dict=trealloc(nblkng,nblkng_dict,nblkng_max); - - nblkng_dict[nblkng_n] = tmalloc(struct nonblocking_private, 1); - - nblkng_dict[nblkng_n]->u = (void *)u; - nblkng_dict[nblkng_n]->dom = dom; - nblkng_dict[nblkng_n]->op = op; - 
nblkng_dict[nblkng_n]->transpose = transpose; - nblkng_dict[nblkng_n]->gsh = gsh; - nblkng_dict[nblkng_n]->buf = buf; - nblkng_dict[nblkng_n]->vn = vn; - nblkng_dict[nblkng_n]->mode = mode_many; - - *handle = nblkng_n++; - nblkng_count++; - - gs_aux_irecv((void *)u,mode_many,vn,dom,op,transpose,gsh,buf); - gs_aux_isend((void *)u,mode_many,vn,dom,op,transpose,gsh,buf); -} - -/*------------------------------------------------------------------------------ - Main Setup -------------------------------------------------------------------------------*/ -typedef enum {gs_auto, gs_pairwise, gs_crystal_router, gs_all_reduce} gs_method; - -static uint local_setup(struct gs_data *gsh, const struct array *nz) -{ - uint mem_size = 0; - gsh->map_local[0] = local_map(nz,1, &mem_size); - gsh->map_local[1] = local_map(nz,0, &mem_size); - gsh->flagged_primaries = flagged_primaries_map(nz, &mem_size); - return mem_size; -} - -static void gs_setup_aux(struct gs_data *gsh, const slong *id, uint n, - int unique, gs_method method, int verbose) -{ - static setup_fun *const remote_setup[] = - { &auto_setup, &pw_setup, &cr_setup, &allreduce_setup }; - - struct gs_topology top; - struct crystal cr; - - crystal_init(&cr,&gsh->comm); - - get_topology(&top, id,n, &cr); - if(unique) make_topology_unique(&top,0,gsh->comm.id,&cr.data); - - gsh->handle_size = sizeof(struct gs_data); - gsh->handle_size += local_setup(gsh,&top.nz); - - if(verbose && gsh->comm.id==0) - printf("gs_setup: %ld unique labels shared\n",(long)top.total_shared); - - remote_setup[method](&gsh->r, &top,&gsh->comm,&cr.data); - gsh->handle_size += gsh->r.mem_size; - - if(verbose) { /* report memory usage */ - double avg[2],td[2]; uint min[2],max[2],ti[2]; - avg[0] = min[0] = max[0] = gsh->handle_size; - avg[1] = min[1] = max[1] = sizeof(double)*gsh->r.buffer_size; - avg[0] /= gsh->comm.np; avg[1] /= gsh->comm.np; - comm_allreduce(&gsh->comm,gs_double,gs_add, avg,2, td); - comm_allreduce(&gsh->comm,gs_sint,gs_min, min,2, ti); - 
comm_allreduce(&gsh->comm,gs_sint,gs_max, max,2, ti); - if(gsh->comm.id==0) { - printf(" " "handle bytes (avg, min, max)" ": " "%g %u %u\n", - avg[0], (unsigned)min[0], (unsigned)max[0]); - printf(" " "buffer bytes (avg, min, max)" ": " "%g %u %u\n", - avg[1], (unsigned)min[1], (unsigned)max[1]); - } - } - - gs_topology_free(&top); - crystal_free(&cr); -} - -struct gs_data *gs_setup(const slong *id, uint n, const struct comm *comm, - int unique, gs_method method, int verbose) -{ - struct gs_data *gsh = tmalloc(struct gs_data,1); - comm_dup(&gsh->comm,comm); - gs_setup_aux(gsh,id,n,unique,method,verbose); - return gsh; -} - -void gs_free(struct gs_data *gsh) -{ - comm_free(&gsh->comm); - free((uint*)gsh->map_local[0]), free((uint*)gsh->map_local[1]); - free((uint*)gsh->flagged_primaries); - gsh->r.fin(gsh->r.data); - free(gsh); -} - -void gs_unique(slong *id, uint n, const struct comm *comm) -{ - struct gs_topology top; - struct crystal cr; - crystal_init(&cr,comm); - get_topology(&top, id,n, &cr); - make_topology_unique(&top,id,comm->id,&cr.data); - gs_topology_free(&top); - crystal_free(&cr); -} - -/*------------------------------------------------------------------------------ - FORTRAN interface -------------------------------------------------------------------------------*/ - -#undef gs_op - -#undef gs_unique -#undef gs_free -#undef gs_setup -#undef gs_many -#undef gs_vec -#undef gs -#undef igs -#undef igs_vec -#undef igs_many -#undef gs_wait - -#define cgs PREFIXED_NAME(gs ) -#define cgs_vec PREFIXED_NAME(gs_vec ) -#define cgs_many PREFIXED_NAME(gs_many ) -#define cgs_setup PREFIXED_NAME(gs_setup) -#define cgs_free PREFIXED_NAME(gs_free ) -#define cgs_unique PREFIXED_NAME(gs_unique) -#define cigs PREFIXED_NAME(igs ) -#define cigs_vec PREFIXED_NAME(igs_vec ) -#define cigs_many PREFIXED_NAME(igs_many) -#define cgs_wait PREFIXED_NAME(gs_wait ) - -#define fgs_setup_pick FORTRAN_NAME(gs_setup_pick,GS_SETUP_PICK) -#define fgs_setup FORTRAN_NAME(gs_setup ,GS_SETUP ) 
-#define fgs FORTRAN_NAME(gs_op ,GS_OP ) -#define fgs_vec FORTRAN_NAME(gs_op_vec ,GS_OP_VEC ) -#define fgs_many FORTRAN_NAME(gs_op_many ,GS_OP_MANY ) -#define figs FORTRAN_NAME(igs_op ,IGS_OP ) -#define figs_vec FORTRAN_NAME(igs_op_vec ,IGS_OP_VEC ) -#define figs_many FORTRAN_NAME(igs_op_many ,IGS_OP_MANY ) -#define fgs_wait FORTRAN_NAME(gs_op_wait ,GS_OP_WAIT ) -#define fgs_fields FORTRAN_NAME(gs_op_fields ,GS_OP_FIELDS ) -#define fgs_free FORTRAN_NAME(gs_free ,GS_FREE ) -#define fgs_unique FORTRAN_NAME(gs_unique ,GS_UNIQUE ) - -static struct gs_data **fgs_info = 0; -static int fgs_max = 0; -static int fgs_n = 0; - -struct gs_data* gs_hf2c(const sint gsh) -{ - return fgs_info[gsh]; -} - -void fgs_setup_pick(sint *handle, const slong id[], const sint *n, - const MPI_Fint *comm, const sint *np, const sint *method) -{ - struct gs_data *gsh; - if(fgs_n==fgs_max) fgs_max+=fgs_max/2+1, - fgs_info=trealloc(struct gs_data*,fgs_info,fgs_max); - gsh=fgs_info[fgs_n]=tmalloc(struct gs_data,1); - comm_init_check(&gsh->comm,*comm,*np); - gs_setup_aux(gsh,id,*n,0,*method,1); - *handle = fgs_n++; -} - -void fgs_setup(sint *handle, const slong id[], const sint *n, - const MPI_Fint *comm, const sint *np) -{ - const sint method = gs_auto; - fgs_setup_pick(handle,id,n,comm,np,&method); -} - -static void fgs_check_handle(sint handle, const char *func, unsigned line) -{ - if(handle<0 || handle>=fgs_n || !fgs_info[handle]) - fail(1,__FILE__,line,"%s: invalid handle", func); -} - -static const gs_dom fgs_dom[4] = { 0, gs_double, gs_sint, gs_slong }; - -static void fgs_check_parms(sint handle, sint dom, sint op, - const char *func, unsigned line) -{ - if(dom<1 || dom>3) - fail(1,__FILE__,line,"%s: datatype %d not in valid range 1-3",func,dom); - if(op <1 || op >4) - fail(1,__FILE__,line,"%s: op %d not in valid range 1-4",func,op); - fgs_check_handle(handle,func,line); -} - -void fgs(const sint *handle, void *u, const sint *dom, const sint *op, - const sint *transpose) -{ - 
fgs_check_parms(*handle,*dom,*op,"gs_op",__LINE__); - cgs(u,fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0,fgs_info[*handle],0); -} - -void figs(const sint *handle, void *u, const sint *dom, const sint *op, - const sint *transpose, int *wait) -{ - fgs_check_parms(*handle,*dom,*op,"gs_op",__LINE__); - cigs(u,fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0,fgs_info[*handle],0,wait); -} - -void fgs_vec(const sint *handle, void *u, const sint *n, - const sint *dom, const sint *op, const sint *transpose) -{ - fgs_check_parms(*handle,*dom,*op,"gs_op_vec",__LINE__); - cgs_vec(u,*n,fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0, - fgs_info[*handle],0); -} - -void figs_vec(const sint *handle, void *u, const sint *n, - const sint *dom, const sint *op, const sint *transpose, int *wait) -{ - fgs_check_parms(*handle,*dom,*op,"gs_op_vec",__LINE__); - cigs_vec(u,*n,fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0, - fgs_info[*handle],0,wait); -} - -void fgs_many(const sint *handle, void *u1, void *u2, void *u3, - void *u4, void *u5, void *u6, const sint *n, - const sint *dom, const sint *op, const sint *transpose) -{ - void *uu[6]; - uu[0]=u1,uu[1]=u2,uu[2]=u3,uu[3]=u4,uu[4]=u5,uu[5]=u6; - fgs_check_parms(*handle,*dom,*op,"gs_op_many",__LINE__); - cgs_many((void *const*)uu,*n,fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0, - fgs_info[*handle],0); -} - -void figs_many(const sint *handle, void *u1, void *u2, void *u3, - void *u4, void *u5, void *u6, const sint *n, - const sint *dom, const sint *op, const sint *transpose, - int *wait) -{ - void *uu[6]; - uu[0]=u1,uu[1]=u2,uu[2]=u3,uu[3]=u4,uu[4]=u5,uu[5]=u6; - fgs_check_parms(*handle,*dom,*op,"gs_op_many",__LINE__); - cigs_many((void *const*)uu,*n,fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0, - fgs_info[*handle],0,wait); -} - -void fgs_wait(int *handle) -{ - cgs_wait(*handle); -} - -static struct array fgs_fields_array = null_array; - -void fgs_fields(const sint *handle, - void *u, const sint *stride, const sint *n, - const sint *dom, const sint 
*op, const sint *transpose) -{ - size_t offset; - void **p; - uint i; - - fgs_check_parms(*handle,*dom,*op,"gs_op_fields",__LINE__); - if(*n<0) return; - - array_reserve(void*,&fgs_fields_array,*n); - p = fgs_fields_array.ptr; - offset = *stride * gs_dom_size[*dom-1]; - for(i=*n;i;--i) *p++ = u, u = (char*)u + offset; - - cgs_many((void *const*)fgs_fields_array.ptr,*n, - fgs_dom[*dom],(gs_op_t)(*op-1), - *transpose!=0, fgs_info[*handle],0); -} - -void fgs_free(const sint *handle) -{ - fgs_check_handle(*handle,"gs_free",__LINE__); - cgs_free(fgs_info[*handle]); - fgs_info[*handle] = 0; -} - -void fgs_unique(slong id[], const sint *n, const MPI_Fint *c, const sint *np) -{ - struct comm *comm; - uint un = *n; - comm = tmalloc(struct comm, 1); - comm_init_check(comm, *c, *np); - cgs_unique(id, un, comm); - free(comm); -} diff --git a/3rdParty/gslib/src/gs.h b/3rdParty/gslib/src/gs.h deleted file mode 100644 index 5598e589c..000000000 --- a/3rdParty/gslib/src/gs.h +++ /dev/null @@ -1,155 +0,0 @@ -#ifndef GS_H -#define GS_H - -#if !defined(COMM_H) || !defined(GS_DEFS_H) || !defined(MEM_H) -#warning "gs.h" requires "comm.h", "gs_defs.h", and "mem.h" -#endif - -/* - Gather/Scatter Library - - The code - - struct comm c; // see "comm.h" - slong id[n]; // the slong type is defined in "types.h" - ... - struct gs_data *g = gs_setup(id,n, &c, 0,gs_auto,1); - - defines a partition of the set of (processor, local index) pairs, - (p,i) \in S_j iff abs(id[i]) == j on processor p - That is, all (p,i) pairs are grouped together (in group S_j) that have the - same id (=j). - S_0 is treated specially --- it is ignored completely - (i.e., when id[i] == 0, local index i does not participate in any - gather/scatter operation - If id[i] on proc p is negative then the pair (p,i) is "flagged". This - determines the non-symmetric behavior. For the simpler, symmetric case, - all id's should be positive. 
- - The second to last argument to gs_setup is the method to use, one of - gs_pairwise, gs_crystal_router, gs_all_reduce, gs_auto - The method "gs_auto" tries ~10 runs of each and chooses the fastest. - For a single-use handle, it makes more sense to use "gs_crystal_router". - - When "g" is no longer needed, free it with - - gs_free(g); - - A basic gather/scatter operation is, e.g., - - double v[n]; buffer buf; // see "mem.h" for "buffer" - ... - gs(v, gs_double,gs_add, 0, g,&buf); - - The buffer pointer can be null, in which case, a static buffer is used, - shared across all gs handles. - This gs call has the effect, (in the simple, symmetric, unflagged case) - - v[i] <-- \sum_{ (p,j) \in S_{id[i]} } v_(p) [j] - - where v_(p) [j] means v[j] on proc p. In other words, every v[i] is replaced - by the sum of all v[j]'s with the same id, given by id[i]. This accomplishes - "direct stiffness summation" corresponding to the action of QQ^T, where - "Q" is a boolean matrix that copies from a global vector (indexed by id) - to the local vectors indexed by (p,i) pairs. - - Summation on doubles is not the only operation and datatype supported. The - full list is defined in "gs_defs.h", and includes the operations - gs_add, gs_mul, gs_max, gs_min - and datatypes - gs_double, gs_float, gs_int, gs_long, gs_sint, gs_slong. - (The int and long types are the plain C types, whereas sint and slong - are defined in "types.h"). - - For the nonsymmetric behavior, the "transpose" parameter is important: - - gs(v, gs_double,gs_add, transpose, g,&buf); - - When transpose == 0, any "flagged" (p,i) pairs (id[i] negative on p) - do not participate in the sum, but *do* still receive the sum on output. - As a special case, when only one (p,i) pair is unflagged per group this - corresponds to the rectangular "Q" matrix referred to above. - - When transpose == 1, the "flagged" (p,i) pairs *do* participate in the sum, - but do *not* get set on output. 
In the special case of only one unflagged - (p,i) pair, this corresponds to the transpose of "Q" referred to above. - - - - A version for vectors (contiguously packed) is, e.g., - - double v[n][k]; - gs_vec(v,k, gs_double,gs_add, transpose, g,&buf); - - which is like "gs" operating on the datatype double[k], - with summation here being vector summation. Number of messages sent - is independent of k. - - For combining the communication for "gs" on multiple arrays: - - double v1[n], v2[n], ..., vk[n]; - double (*vs)[k] = {v1, v2, ..., vk}; - - gs_many(vs,k, gs_double,op, t, g,&buf); - - This call is equivalent to - - gs(v1, gs_double,op, t, g, &buf); - gs(v2, gs_double,op, t, g, &buf); - ... - gs(vk, gs_double,op, t, g, &buf); - - except that all communication is done together. - - - - Finally, gs_unique has the same basic signature as gs_setup: - - gs_unique(id,n, &c); - - This call modifies id, "flagging" (by negating id[i]) all (p,i) pairs in - each group except one. The sole "unflagged" member of the group is chosen - in an arbitrary but consistent way. If the "unique" flag is set when - calling gs_setup, the behavior is equivalent to first calling gs_unique, - except that the id array is left unmodified. 
- - -*/ - -#define gs PREFIXED_NAME(gs ) -#define gs_vec PREFIXED_NAME(gs_vec ) -#define gs_many PREFIXED_NAME(gs_many ) -#define igs PREFIXED_NAME(igs ) -#define igs_vec PREFIXED_NAME(igs_vec ) -#define igs_many PREFIXED_NAME(igs_many ) -#define gs_wait PREFIXED_NAME(gs_wait ) -#define gs_setup PREFIXED_NAME(gs_setup ) -#define gs_free PREFIXED_NAME(gs_free ) -#define gs_unique PREFIXED_NAME(gs_unique) -#define gs_hf2c PREFIXED_NAME(gs_hf2c ) - -struct gs_data; -typedef enum {gs_auto, gs_pairwise, gs_crystal_router, gs_all_reduce} gs_method; - -void gs(void *u, gs_dom dom, gs_op op, unsigned transpose, - struct gs_data *gsh, buffer *buf); -void gs_vec(void *u, unsigned vn, gs_dom dom, gs_op op, - unsigned transpose, struct gs_data *gsh, buffer *buf); -void gs_many(void *const*u, unsigned vn, gs_dom dom, gs_op op, - unsigned transpose, struct gs_data *gsh, buffer *buf); - -void igs(void *u, gs_dom dom, gs_op op, unsigned transpose, - struct gs_data *gsh, buffer *buf, int *handle); -void igs_vec(void *u, unsigned vn, gs_dom dom, gs_op op, - unsigned transpose, struct gs_data *gsh, buffer *buf, int *handle); -void igs_many(void *const*u, unsigned vn, gs_dom dom, gs_op op, - unsigned transpose, struct gs_data *gsh, buffer *buf, int *handle); -void gs_wait(int handle); - -struct gs_data *gs_setup(const slong *id, uint n, const struct comm *comm, - int unique, gs_method method, int verbose); -void gs_free(struct gs_data *gsh); -void gs_unique(slong *id, uint n, const struct comm *comm); -struct gs_data* gs_hf2c(const sint gsh); - -#endif diff --git a/3rdParty/gslib/src/gs_defs.h b/3rdParty/gslib/src/gs_defs.h deleted file mode 100644 index df4ad7be4..000000000 --- a/3rdParty/gslib/src/gs_defs.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef GS_DEFS_H -#define GS_DEFS_H - -/* requires: - , for GS_DEFINE_IDENTITIES() - "types.h" for gs_sint, gs_slong -*/ - -/*------------------------------------------------------------------------------ - Monoid Definitions - - Here are defined 
the domains and operations, each combination being a - commutative semigroup, as well as the identity element making each a - commutative monoid. -------------------------------------------------------------------------------*/ - -/* the supported domains */ -#define GS_FOR_EACH_DOMAIN(macro) \ - macro(double) \ - macro(float ) \ - macro(int ) \ - macro(long ) \ - WHEN_LONG_LONG(macro(long_long)) - -/* the supported ops */ -#define GS_FOR_EACH_OP(T,macro) \ - macro(T,add) \ - macro(T,mul) \ - macro(T,min) \ - macro(T,max) \ - macro(T,bpr) - -#define GS_DO_add(a,b) a+=b -#define GS_DO_mul(a,b) a*=b -#define GS_DO_min(a,b) if(ba) a=b -#define GS_DO_bpr(a,b) \ - do if(b!=0) { uint a_ = a; uint b_ = b; \ - if(a_==0) { a=b_; break; } \ - for(;;) { if(a_>=1; else if(b_>=1; else break; } \ - a = a_; \ - } while(0) - -/* the monoid identity elements */ -#define GS_DEFINE_MONOID_ID(T,min,max) \ - static const T gs_identity_##T[] = { 0, 1, max, min, 0 }; -#define GS_DEFINE_IDENTITIES() \ - GS_DEFINE_MONOID_ID(double, -DBL_MAX, DBL_MAX) \ - GS_DEFINE_MONOID_ID(float , -FLT_MAX, FLT_MAX) \ - GS_DEFINE_MONOID_ID(int , INT_MIN, INT_MAX) \ - GS_DEFINE_MONOID_ID(long , LONG_MIN, LONG_MAX) \ - WHEN_LONG_LONG(GS_DEFINE_MONOID_ID(long_long,LLONG_MIN,LLONG_MAX)) - -/*------------------------------------------------------------------------------ - Enums and constants -------------------------------------------------------------------------------*/ - -/* domain enum */ -#define LIST GS_FOR_EACH_DOMAIN(ITEM) gs_dom_n -#define ITEM(T) gs_##T, -typedef enum { LIST } gs_dom; -#undef ITEM -#undef LIST - -#define gs_sint TYPE_LOCAL(gs_int,gs_long,gs_long_long) -#define gs_slong TYPE_GLOBAL(gs_int,gs_long,gs_long_long) - -/* domain type size array */ -#define GS_DOM_SIZE_ITEM(T) sizeof(T), -#define GS_DEFINE_DOM_SIZES() \ - static const unsigned gs_dom_size[] = \ - { GS_FOR_EACH_DOMAIN(GS_DOM_SIZE_ITEM) 0 }; - -/* operation enum */ -#define LIST GS_FOR_EACH_OP(T,ITEM) gs_op_n -#define 
ITEM(T,op) gs_##op, -typedef enum { LIST } gs_op; -#undef ITEM -#undef LIST - -#endif diff --git a/3rdParty/gslib/src/gs_local.c b/3rdParty/gslib/src/gs_local.c deleted file mode 100644 index 170e94d4c..000000000 --- a/3rdParty/gslib/src/gs_local.c +++ /dev/null @@ -1,336 +0,0 @@ -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "types.h" - -#define gs_gather_array PREFIXED_NAME(gs_gather_array ) -#define gs_init_array PREFIXED_NAME(gs_init_array ) -#define gs_gather PREFIXED_NAME(gs_gather ) -#define gs_scatter PREFIXED_NAME(gs_scatter ) -#define gs_init PREFIXED_NAME(gs_init ) -#define gs_gather_vec PREFIXED_NAME(gs_gather_vec ) -#define gs_scatter_vec PREFIXED_NAME(gs_scatter_vec ) -#define gs_init_vec PREFIXED_NAME(gs_init_vec ) -#define gs_gather_many PREFIXED_NAME(gs_gather_many ) -#define gs_scatter_many PREFIXED_NAME(gs_scatter_many ) -#define gs_init_many PREFIXED_NAME(gs_init_many ) -#define gs_gather_vec_to_many PREFIXED_NAME(gs_gather_vec_to_many ) -#define gs_scatter_many_to_vec PREFIXED_NAME(gs_scatter_many_to_vec) -#define gs_scatter_vec_to_many PREFIXED_NAME(gs_scatter_vec_to_many) - -#include "gs_defs.h" -GS_DEFINE_IDENTITIES() -GS_DEFINE_DOM_SIZES() - -/*------------------------------------------------------------------------------ - The array gather kernel -------------------------------------------------------------------------------*/ -#define DEFINE_GATHER(T,OP) \ -static void gather_array_##T##_##OP( \ - T *restrict out, const T *restrict in, uint n) \ -{ \ - for(;n;--n) { T q = *in++, *p = out++; GS_DO_##OP(*p,q); } \ -} - -/*------------------------------------------------------------------------------ - The array initialization kernel -------------------------------------------------------------------------------*/ -#define DEFINE_INIT(T) \ -static void init_array_##T(T *restrict out, uint n, gs_op op) \ -{ \ - const T e = gs_identity_##T[op]; \ - for(;n;--n) *out++=e; \ -} - -#define DEFINE_PROCS(T) \ - 
GS_FOR_EACH_OP(T,DEFINE_GATHER) \ - DEFINE_INIT(T) - -GS_FOR_EACH_DOMAIN(DEFINE_PROCS) - -#undef DEFINE_PROCS -#undef DEFINE_INIT -#undef DEFINE_GATHER - -/*------------------------------------------------------------------------------ - The basic gather kernel -------------------------------------------------------------------------------*/ -#define DEFINE_GATHER(T,OP) \ -static void gather_##T##_##OP( \ - T *restrict out, const T *restrict in, const unsigned in_stride, \ - const uint *restrict map) \ -{ \ - uint i,j; \ - while((i=*map++)!=UINT_MAX) { \ - T t=out[i]; \ - j=*map++; do GS_DO_##OP(t,in[j*in_stride]); while((j=*map++)!=UINT_MAX); \ - out[i]=t; \ - } \ -} - -/*------------------------------------------------------------------------------ - The basic scatter kernel -------------------------------------------------------------------------------*/ -#define DEFINE_SCATTER(T) \ -static void scatter_##T( \ - T *restrict out, const unsigned out_stride, \ - const T *restrict in, const unsigned in_stride, \ - const uint *restrict map) \ -{ \ - uint i,j; \ - while((i=*map++)!=UINT_MAX) { \ - T t=in[i*in_stride]; \ - j=*map++; do out[j*out_stride]=t; while((j=*map++)!=UINT_MAX); \ - } \ -} - -/*------------------------------------------------------------------------------ - The basic initialization kernel -------------------------------------------------------------------------------*/ -#define DEFINE_INIT(T) \ -static void init_##T(T *restrict out, const uint *restrict map, gs_op op) \ -{ \ - uint i; const T e = gs_identity_##T[op]; \ - while((i=*map++)!=UINT_MAX) out[i]=e; \ -} - -#define DEFINE_PROCS(T) \ - GS_FOR_EACH_OP(T,DEFINE_GATHER) \ - DEFINE_SCATTER(T) \ - DEFINE_INIT(T) - -GS_FOR_EACH_DOMAIN(DEFINE_PROCS) - -#undef DEFINE_PROCS -#undef DEFINE_INIT -#undef DEFINE_SCATTER -#undef DEFINE_GATHER - -/*------------------------------------------------------------------------------ - The vector gather kernel 
-------------------------------------------------------------------------------*/ -#define DEFINE_GATHER(T,OP) \ -static void gather_vec_##T##_##OP( \ - T *restrict out, const T *restrict in, const unsigned vn, \ - const uint *restrict map) \ -{ \ - uint i,j; \ - while((i=*map++)!=UINT_MAX) { \ - T *restrict p = &out[i*vn], *pe = p+vn; \ - j=*map++; do { \ - const T *restrict q = &in[j*vn]; \ - T *restrict pk=p; do { GS_DO_##OP(*pk,*q); ++pk, ++q; } while(pk!=pe); \ - } while((j=*map++)!=UINT_MAX); \ - } \ -} - -/*------------------------------------------------------------------------------ - The vector scatter kernel -------------------------------------------------------------------------------*/ -void gs_scatter_vec( - void *restrict out, const void *restrict in, const unsigned vn, - const uint *restrict map, gs_dom dom) -{ - unsigned unit_size = vn*gs_dom_size[dom]; - uint i,j; - while((i=*map++)!=UINT_MAX) { - const char *t = (const char *)in + i*unit_size; - j=*map++; do - memcpy((char *)out+j*unit_size,t,unit_size); - while((j=*map++)!=UINT_MAX); - } -} - -/*------------------------------------------------------------------------------ - The vector initialization kernel -------------------------------------------------------------------------------*/ -#define DEFINE_INIT(T) \ -static void init_vec_##T(T *restrict out, const unsigned vn, \ - const uint *restrict map, gs_op op) \ -{ \ - uint i; const T e = gs_identity_##T[op]; \ - while((i=*map++)!=UINT_MAX) { \ - T *restrict u = (T*)out + vn*i, *ue = u+vn; \ - do *u++ = e; while(u!=ue); \ - } \ -} - -#define DEFINE_PROCS(T) \ - GS_FOR_EACH_OP(T,DEFINE_GATHER) \ - DEFINE_INIT(T) - -GS_FOR_EACH_DOMAIN(DEFINE_PROCS) - -#undef DEFINE_PROCS -#undef DEFINE_INIT -#undef DEFINE_GATHER - -#undef DO_bpr -#undef DO_max -#undef DO_min -#undef DO_mul -#undef DO_add - -#define SWITCH_DOMAIN_CASE(T) case gs_##T: WITH_DOMAIN(T); break; -#define SWITCH_DOMAIN(dom) do switch(dom) { \ - GS_FOR_EACH_DOMAIN(SWITCH_DOMAIN_CASE) 
case gs_dom_n: break; } while(0) - -#define SWITCH_OP_CASE(T,OP) case gs_##OP: WITH_OP(T,OP); break; -#define SWITCH_OP(T,op) do switch(op) { \ - GS_FOR_EACH_OP(T,SWITCH_OP_CASE) case gs_op_n: break; } while(0) - -/*------------------------------------------------------------------------------ - Array kernels -------------------------------------------------------------------------------*/ -void gs_gather_array(void *out, const void *in, uint n, gs_dom dom, gs_op op) -{ -#define WITH_OP(T,OP) gather_array_##T##_##OP(out,in,n) -#define WITH_DOMAIN(T) SWITCH_OP(T,op) - SWITCH_DOMAIN(dom); -#undef WITH_DOMAIN -#undef WITH_OP -} - -void gs_init_array(void *out, uint n, gs_dom dom, gs_op op) -{ -#define WITH_DOMAIN(T) init_array_##T(out,n,op) - SWITCH_DOMAIN(dom); -#undef WITH_DOMAIN -} - -/*------------------------------------------------------------------------------ - Plain kernels; vn parameter ignored but present for consistent signatures -------------------------------------------------------------------------------*/ -void gs_gather(void *out, const void *in, const unsigned vn, - const uint *map, gs_dom dom, gs_op op) -{ -#define WITH_OP(T,OP) gather_##T##_##OP(out,in,1,map) -#define WITH_DOMAIN(T) SWITCH_OP(T,op) - SWITCH_DOMAIN(dom); -#undef WITH_DOMAIN -#undef WITH_OP -} - -void gs_scatter(void *out, const void *in, const unsigned vn, - const uint *map, gs_dom dom) -{ -#define WITH_DOMAIN(T) scatter_##T(out,1,in,1,map) - SWITCH_DOMAIN(dom); -#undef WITH_DOMAIN -} - -void gs_init(void *out, const unsigned vn, const uint *map, - gs_dom dom, gs_op op) -{ -#define WITH_DOMAIN(T) init_##T(out,map,op) - SWITCH_DOMAIN(dom); -#undef WITH_DOMAIN -} - -/*------------------------------------------------------------------------------ - Vector kernels -------------------------------------------------------------------------------*/ -void gs_gather_vec(void *out, const void *in, const unsigned vn, - const uint *map, gs_dom dom, gs_op op) -{ -#define WITH_OP(T,OP) 
gather_vec_##T##_##OP(out,in,vn,map) -#define WITH_DOMAIN(T) SWITCH_OP(T,op) - SWITCH_DOMAIN(dom); -#undef WITH_DOMAIN -#undef WITH_OP -} - -void gs_init_vec(void *out, const unsigned vn, const uint *map, - gs_dom dom, gs_op op) -{ -#define WITH_DOMAIN(T) init_vec_##T(out,vn,map,op) - SWITCH_DOMAIN(dom); -#undef WITH_DOMAIN -} - -/*------------------------------------------------------------------------------ - Multiple array kernels -------------------------------------------------------------------------------*/ -void gs_gather_many(void *out, const void *in, const unsigned vn, - const uint *map, gs_dom dom, gs_op op) -{ - uint k; - typedef void *ptr_to_void; typedef const void *ptr_to_const_void; - const ptr_to_void *p = out; const ptr_to_const_void *q = in; -#define WITH_OP(T,OP) for(k=0;k multiple arrays - Scatter from multiple arrays -> strided array, - Scatter from strided array -> multiple arrays, -------------------------------------------------------------------------------*/ -void gs_gather_vec_to_many(void *out, const void *in, const unsigned vn, - const uint *map, gs_dom dom, gs_op op) -{ - unsigned i; const unsigned unit_size = gs_dom_size[dom]; - typedef void *ptr_to_void; - const ptr_to_void *p = out; const char *q = in; -#define WITH_OP(T,OP) \ - for(i=vn;i;--i) gather_##T##_##OP(*p++,(const T*)q,vn,map), q+=unit_size -#define WITH_DOMAIN(T) SWITCH_OP(T,op) - SWITCH_DOMAIN(dom); -#undef WITH_DOMAIN -#undef WITH_OP -} - -void gs_scatter_many_to_vec(void *out, const void *in, const unsigned vn, - const uint *map, gs_dom dom) -{ - unsigned i; const unsigned unit_size = gs_dom_size[dom]; - typedef const void *ptr_to_const_void; - char *p = out; const ptr_to_const_void *q = in; -#define WITH_DOMAIN(T) \ - for(i=vn;i;--i) scatter_##T((T*)p,vn,*q++,1,map), p+=unit_size - SWITCH_DOMAIN(dom); -#undef WITH_DOMAIN -} - -void gs_scatter_vec_to_many(void *out, const void *in, const unsigned vn, - const uint *map, gs_dom dom) -{ - unsigned i; const unsigned 
unit_size = gs_dom_size[dom]; - typedef void *ptr_to_void; - const ptr_to_void *p = out; const char *q = in; -#define WITH_DOMAIN(T) \ - for(i=vn;i;--i) scatter_##T(*p++,1,(const T*)q,vn,map), q+=unit_size - SWITCH_DOMAIN(dom); -#undef WITH_DOMAIN -} - -#undef SWITCH_OP -#undef SWITCH_OP_CASE -#undef SWITCH_DOMAIN -#undef SWITCH_DOMAIN_CASE diff --git a/3rdParty/gslib/src/gs_local.h b/3rdParty/gslib/src/gs_local.h deleted file mode 100644 index fc7c41499..000000000 --- a/3rdParty/gslib/src/gs_local.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef GS_LOCAL_H -#define GS_LOCAL_H - -#if !defined(NAME_H) || !defined(TYPES_H) || !defined(GS_DEFS_H) -#warning "gs_local.h" requires "name.h", "types.h", and "gs_defs.h" -#endif - -#define gs_gather_array PREFIXED_NAME(gs_gather_array ) -#define gs_init_array PREFIXED_NAME(gs_init_array ) -#define gs_gather PREFIXED_NAME(gs_gather ) -#define gs_scatter PREFIXED_NAME(gs_scatter ) -#define gs_init PREFIXED_NAME(gs_init ) -#define gs_gather_vec PREFIXED_NAME(gs_gather_vec ) -#define gs_scatter_vec PREFIXED_NAME(gs_scatter_vec ) -#define gs_init_vec PREFIXED_NAME(gs_init_vec ) -#define gs_gather_many PREFIXED_NAME(gs_gather_many ) -#define gs_scatter_many PREFIXED_NAME(gs_scatter_many ) -#define gs_init_many PREFIXED_NAME(gs_init_many ) -#define gs_gather_vec_to_many PREFIXED_NAME(gs_gather_vec_to_many ) -#define gs_scatter_many_to_vec PREFIXED_NAME(gs_scatter_many_to_vec) -#define gs_scatter_vec_to_many PREFIXED_NAME(gs_scatter_vec_to_many) - -void gs_gather_array(void *out, const void *in, uint n, - gs_dom dom, gs_op op); -void gs_init_array(void *out, uint n, gs_dom dom, gs_op op); - -typedef void gs_gather_fun( - void *out, const void *in, const unsigned vn, - const uint *map, gs_dom dom, gs_op op); -typedef void gs_scatter_fun( - void *out, const void *in, const unsigned vn, - const uint *map, gs_dom dom); -typedef void gs_init_fun( - void *out, const unsigned vn, - const uint *map, gs_dom dom, gs_op op); - -extern gs_gather_fun 
gs_gather, gs_gather_vec, gs_gather_many, - gs_gather_vec_to_many; -extern gs_scatter_fun gs_scatter, gs_scatter_vec, gs_scatter_many, - gs_scatter_many_to_vec, gs_scatter_vec_to_many; -extern gs_init_fun gs_init, gs_init_vec, gs_init_many; - -#endif diff --git a/3rdParty/gslib/src/gslib.h b/3rdParty/gslib/src/gslib.h deleted file mode 100644 index 2b1956838..000000000 --- a/3rdParty/gslib/src/gslib.h +++ /dev/null @@ -1,20 +0,0 @@ -#define UNDERSCORE 1 -#define USE_NAIVE_BLAS -#define NO_NEX_EXITT 1 -#define GLOBAL_LONG_LONG 1 - -#define MPI 1 - -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "comm.h" -#include "mem.h" -#include "gs_defs.h" -#include "gs.h" -#include "crs.h" diff --git a/3rdParty/gslib/src/lob_bnd.c b/3rdParty/gslib/src/lob_bnd.c deleted file mode 100644 index d81a9a063..000000000 --- a/3rdParty/gslib/src/lob_bnd.c +++ /dev/null @@ -1,285 +0,0 @@ -#include -#include -#include -#include /* for cos, fabs */ -#include -#include "c99.h" -#include "name.h" -#include "types.h" -#include "fail.h" -#include "mem.h" -#include "poly.h" - -#define lob_bnd_setup PREFIXED_NAME(lob_bnd_setup) -#define lob_bnd_lin_1 PREFIXED_NAME(lob_bnd_lin_1) -#define lob_bnd_lin_2 PREFIXED_NAME(lob_bnd_lin_2) -#define lob_bnd_lin_3 PREFIXED_NAME(lob_bnd_lin_3) -#define lob_bnd_1 PREFIXED_NAME(lob_bnd_1 ) -#define lob_bnd_2 PREFIXED_NAME(lob_bnd_2 ) -#define lob_bnd_3 PREFIXED_NAME(lob_bnd_3 ) - -struct dbl_range { double min,max; }; - -/*-------------------------------------------------------------------------- - Bounds for Polynomials on [-1,1]^d - given in the Lagrangian basis on - Gauss-Lobatto-Legendre quadrature nodes - - The main parameters are the number of GLL nodes in each dimension - unsigned nr = ..., ns = ..., nt = ...; - - The number of points in the constructed piecewise (tri-,bi-)linear bounds - is a parameter; more points give tighter bounds, and we expect m>n. 
- - unsigned mr = 4*nr, ms = 4*ns, mt = 4*nt; - - The necessary setup is accomplished via: - double *data_r = tmalloc(double, lob_bnd_size(nr,mr)); - double *data_s = tmalloc(double, lob_bnd_size(ns,ms)); - double *data_t = tmalloc(double, lob_bnd_size(nt,mt)); - lob_bnd_setup(data_r, nr,mr); - lob_bnd_setup(data_s, ns,ms); - lob_bnd_setup(data_t, nt,mt); - - Bounds may then be computed via: - double work1r[2*mr], work1s[2*ms]; - double work2[2*mr*(ns+ms+1)]; - double work3[2*mr*ms*(nt+mt+1)]; - double ur[nr], us[ns]; // 1-d polynomials on the zr[] and zs[] nodes - double u2[ns][nr]; // 2-d polynomial on zr[] (x) zs[] - double u3[nt][ns][nr]; // 3-d polynomial on zr[] (x) zs[] (x) zt[] - struct dbl_range bound; - - bound = lob_bnd_1(data_r,nr,mr, ur, work1r); // compute bounds on ur - bound = lob_bnd_1(data_s,ns,ms, us, work1s); // compute bounds on us - bound = lob_bnd_2(data_r,nr,mr, data_s,ns,ms, - (const double*)&u2[0][0], work2); // compute bounds on u2 - bound = lob_bnd_3(data_r,nr,mr, data_s,ns,ms, data_t,nt,mt, - (const double*)&u3[0][0], work3); // compute bounds on u3 - - free(data_r), free(data_s), free(data_t); - - The functions lob_bnd_lin_d compute the piecewise d-linear bounds. 
- Nodes for these are Chebyshev-Lobatto: - h[0] = -1, h[m-1] = 1; - for(j=1;j=0) - for(j=0;j=0) /* 0 <= w0 <= w1 */ - for(k=0;kbnd.max?b[1]:bnd.max; - return bnd; -} - -/* work holds 2*m doubles */ -struct dbl_range lob_bnd_1( - const double *restrict lob_bnd_data, unsigned n, unsigned m, - const double *restrict u, double *restrict work) -{ - lob_bnd_lin_1(work, lob_bnd_data,n,m, u,1); - return minmax(work,m); -} - -/* work holds 2*mr*ms + 2*mr + 2*mr*ns - =2*mr*(ms+1+ns) doubles */ -struct dbl_range lob_bnd_2( - const double *lob_bnd_data_r, unsigned nr, unsigned mr, - const double *lob_bnd_data_s, unsigned ns, unsigned ms, - const double *restrict u, double *restrict work) -{ - unsigned m = mr*ms; - lob_bnd_lin_2(work, lob_bnd_data_r,nr,mr, - lob_bnd_data_s,ns,ms, u,1, work+2*m); - return minmax(work,m); -} - -/* work holds 2*mr*ms*mt + 2*mr*ms + 2*nt*ms*mr - =2*mr*ms*(nt+mt+1) doubles */ -struct dbl_range lob_bnd_3( - const double *lob_bnd_data_r, unsigned nr, unsigned mr, - const double *lob_bnd_data_s, unsigned ns, unsigned ms, - const double *lob_bnd_data_t, unsigned nt, unsigned mt, - const double *restrict u, double *restrict work) -{ - unsigned m = mr*ms*mt; - lob_bnd_lin_3(work, lob_bnd_data_r,nr,mr, - lob_bnd_data_s,ns,ms, - lob_bnd_data_t,nt,mt, u,1, work+2*m); - return minmax(work,m); -} diff --git a/3rdParty/gslib/src/lob_bnd.h b/3rdParty/gslib/src/lob_bnd.h deleted file mode 100644 index b47256b3a..000000000 --- a/3rdParty/gslib/src/lob_bnd.h +++ /dev/null @@ -1,111 +0,0 @@ -#ifndef LOB_BND_H -#define LOB_BND_H - -#if !defined(TYPES_H) || !defined(NAME_H) -#warning "lob_bnd.h" requires "types.h" and "name.h" -#endif - -#define lob_bnd_setup PREFIXED_NAME(lob_bnd_setup) -#define lob_bnd_lin_1 PREFIXED_NAME(lob_bnd_lin_1) -#define lob_bnd_lin_2 PREFIXED_NAME(lob_bnd_lin_2) -#define lob_bnd_lin_3 PREFIXED_NAME(lob_bnd_lin_3) -#define lob_bnd_1 PREFIXED_NAME(lob_bnd_1 ) -#define lob_bnd_2 PREFIXED_NAME(lob_bnd_2 ) -#define lob_bnd_3 
PREFIXED_NAME(lob_bnd_3 ) - -/*-------------------------------------------------------------------------- - Bounds for Polynomials on [-1,1]^d - given in the Lagrangian basis on - Gauss-Lobatto-Legendre quadrature nodes - - The main parameters are the number of GLL nodes in each dimension - unsigned nr = ..., ns = ..., nt = ...; - - The number of points in the constructed piecewise (tri-,bi-)linear bounds - is a parameter; more points give tighter bounds, and we expect m>n. - - unsigned mr = 4*nr, ms = 4*ns, mt = 4*nt; - - The necessary setup is accomplished via: - double *data_r = tmalloc(double, lob_bnd_size(nr,mr)); - double *data_s = tmalloc(double, lob_bnd_size(ns,ms)); - double *data_t = tmalloc(double, lob_bnd_size(nt,mt)); - lob_bnd_setup(data_r, nr,mr); - lob_bnd_setup(data_s, ns,ms); - lob_bnd_setup(data_t, nt,mt); - - Bounds may then be computed via: - double work1r[2*mr], work1s[2*ms]; - double work2[2*mr*(ns+ms+1)]; - double work3[2*mr*ms*(nt+mt+1)]; - double ur[nr], us[ns]; // 1-d polynomials on the zr[] and zs[] nodes - double u2[ns][nr]; // 2-d polynomial on zr[] (x) zs[] - double u3[nt][ns][nr]; // 3-d polynomial on zr[] (x) zs[] (x) zt[] - struct dbl_range bound; - - bound = lob_bnd_1(data_r,nr,mr, ur, work1r); // compute bounds on ur - bound = lob_bnd_1(data_s,ns,ms, us, work1s); // compute bounds on us - bound = lob_bnd_2(data_r,nr,mr, data_s,ns,ms, - (const double*)&u2[0][0], work2); // compute bounds on u2 - bound = lob_bnd_3(data_r,nr,mr, data_s,ns,ms, data_t,nt,mt, - (const double*)&u3[0][0], work3); // compute bounds on u3 - - free(data_r), free(data_s), free(data_t); - - The functions lob_bnd_lin_d compute the piecewise d-linear bounds. 
- Nodes for these are Chebyshev-Lobatto: - h[0] = -1, h[m-1] = 1; - for(j=1;j for size_t, offsetof - for malloc, calloc, realloc, free - for memcpy - "c99.h" - "fail.h" -*/ - -#if !defined(C99_H) || !defined(FAIL_H) -#error "mem.h" requires "c99.h" and "fail.h" -#endif - -/* - All memory management goes through the wrappers defined in this - header. Diagnostics can be turned on with - -DPRINT_MALLOCS=1 - Then all memory management operations will be printed to stdout. - - Most memory management occurs through use of the "array" type, - defined below, which defines a generic dynamically-sized array - that grows in bursts. The "buffer" type is a "char" array and - is often passed around by code to provide a common area for - scratch work. -*/ - -#ifndef PRINT_MALLOCS -# define PRINT_MALLOCS 0 -#else -# include -# ifndef comm_gbl_id -# define comm_gbl_id PREFIXED_NAME(comm_gbl_id) -# define comm_gbl_np PREFIXED_NAME(comm_gbl_np) -# include "types.h" - extern uint comm_gbl_id, comm_gbl_np; -# endif -#endif - -/*-------------------------------------------------------------------------- - Memory Allocation Wrappers to Catch Out-of-memory - --------------------------------------------------------------------------*/ - -static inline void *smalloc(size_t size, const char *file, unsigned line) -{ - void *restrict res = malloc(size); - #if PRINT_MALLOCS - fprintf(stdout,"MEM: proc %04d: %p = malloc(%ld) @ %s(%u)\n", - (int)comm_gbl_id,res,(long)size,file,line), fflush(stdout); - #endif - if(!res && size) - fail(1,file,line,"allocation of %ld bytes failed\n",(long)size); - return res; -} - -static inline void *scalloc( - size_t nmemb, size_t size, const char *file, unsigned line) -{ - void *restrict res = calloc(nmemb, size); - #if PRINT_MALLOCS - fprintf(stdout,"MEM: proc %04d: %p = calloc(%ld) @ %s(%u)\n", - (int)comm_gbl_id,res,(long)size*nmemb,file,line), fflush(stdout); - #endif - if(!res && nmemb) - fail(1,file,line,"allocation of %ld bytes failed\n", - 
(long)size*nmemb); - return res; -} - -static inline void *srealloc( - void *restrict ptr, size_t size, const char *file, unsigned line) -{ - void *restrict res = realloc(ptr, size); - #if PRINT_MALLOCS - if(res!=ptr) { - if(ptr) - fprintf(stdout,"MEM: proc %04d: %p freed by realloc @ %s(%u)\n", - (int)comm_gbl_id,ptr,file,line), fflush(stdout); - fprintf(stdout,"MEM: proc %04d: %p = realloc of %p to %lu @ %s(%u)\n", - (int)comm_gbl_id,res,ptr,(long)size,file,line), fflush(stdout); - } else - fprintf(stdout,"MEM: proc %04d: %p realloc'd to %lu @ %s(%u)\n", - (int)comm_gbl_id,res,(long)size,file,line), fflush(stdout); - #endif - if(!res && size) - fail(1,file,line,"allocation of %ld bytes failed\n",(long)size); - return res; -} - -#define tmalloc(type, count) \ - ((type*) smalloc((count)*sizeof(type),__FILE__,__LINE__) ) -#define tcalloc(type, count) \ - ((type*) scalloc((count),sizeof(type),__FILE__,__LINE__) ) -#define trealloc(type, ptr, count) \ - ((type*) srealloc((ptr),(count)*sizeof(type),__FILE__,__LINE__) ) - -#if PRINT_MALLOCS -static inline void sfree(void *restrict ptr, const char *file, unsigned line) -{ - free(ptr); - fprintf(stdout,"MEM: proc %04d: %p freed @ %s(%u)\n", - (int)comm_gbl_id,ptr,file,line), fflush(stdout); -} -#define free(x) sfree(x,__FILE__,__LINE__) -#endif - -/*-------------------------------------------------------------------------- - A dynamic array - --------------------------------------------------------------------------*/ -struct array { void *ptr; size_t n,max; }; -#define null_array {0,0,0} -static void array_init_(struct array *a, size_t max, size_t size, - const char *file, unsigned line) -{ - a->n=0, a->max=max, a->ptr=smalloc(max*size,file,line); -} -static void array_resize_(struct array *a, size_t max, size_t size, - const char *file, unsigned line) -{ - a->max=max, a->ptr=srealloc(a->ptr,max*size,file,line); -} -static void *array_reserve_(struct array *a, size_t min, size_t size, - const char *file, unsigned line) 
-{ - size_t max = a->max; - if(maxptr; -} - -#define array_free(a) (free((a)->ptr)) -#define array_init(T,a,max) array_init_(a,max,sizeof(T),__FILE__,__LINE__) -#define array_resize(T,a,max) array_resize_(a,max,sizeof(T),__FILE__,__LINE__) -#define array_reserve(T,a,min) array_reserve_(a,min,sizeof(T),__FILE__,__LINE__) - -static void array_cat_(size_t size, struct array *d, const void *s, size_t n, - const char *file, unsigned line) -{ - void *out = array_reserve_(d,d->n+n,size, file,line); - memcpy((char*)out+d->n*size, s, n*size); - d->n+=n; -} - -#define array_cat(T,d,s,n) array_cat_(sizeof(T),d,s,n,__FILE__,__LINE__) - -/*-------------------------------------------------------------------------- - Buffer = char array - --------------------------------------------------------------------------*/ -typedef struct array buffer; -#define null_buffer null_array -#define buffer_init(b,max) array_init(char,b,max) -#define buffer_resize(b,max) array_resize(char,b,max) -#define buffer_reserve(b,max) array_reserve(char,b,max) -#define buffer_free(b) array_free(b) - -/*-------------------------------------------------------------------------- - Alignment routines - --------------------------------------------------------------------------*/ -#define ALIGNOF(T) offsetof(struct { char c; T x; }, x) -static size_t align_as_(size_t a, size_t n) { return (n+a-1)/a*a; } -#define align_as(T,n) align_as_(ALIGNOF(T),n) -#define align_ptr(T,base,offset) ((T*)((char*)(base)+align_as(T,offset))) -#endif - diff --git a/3rdParty/gslib/src/name.h b/3rdParty/gslib/src/name.h deleted file mode 100644 index b4bcd9169..000000000 --- a/3rdParty/gslib/src/name.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef NAME_H -#define NAME_H - -/* establishes some macros to establish - * the FORTRAN naming convention - default gs_setup, etc. - -DUPCASE GS_SETUP, etc. - -DUNDERSCORE gs_setup_, etc. 
- * a prefix for all external (non-FORTRAN) function names - for example, -DPREFIX=jl_ transforms fail -> jl_fail - * a prefix for all external FORTRAN function names - for example, -DFPREFIX=jlf_ transforms gs_setup_ -> jlf_gs_setup_ -*/ - -/* the following macro functions like a##b, - but will expand a and/or b if they are themselves macros */ -#define TOKEN_PASTE_(a,b) a##b -#define TOKEN_PASTE(a,b) TOKEN_PASTE_(a,b) - -#ifdef PREFIX -# define PREFIXED_NAME(x) TOKEN_PASTE(PREFIX,x) -#else -# define PREFIXED_NAME(x) x -#endif - -#ifdef FPREFIX -# define FPREFIXED_NAME(x) TOKEN_PASTE(FPREFIX,x) -#else -# define FPREFIXED_NAME(x) x -#endif - -#if defined(UPCASE) -# define FORTRAN_NAME(low,up) FPREFIXED_NAME(up) -# define FORTRAN_UNPREFIXED(low,up) up -#elif defined(UNDERSCORE) -# define FORTRAN_NAME(low,up) FPREFIXED_NAME(TOKEN_PASTE(low,_)) -# define FORTRAN_UNPREFIXED(low,up) TOKEN_PASTE(low,_) -#else -# define FORTRAN_NAME(low,up) FPREFIXED_NAME(low) -# define FORTRAN_UNPREFIXED(low,up) low -#endif - -#endif - diff --git a/3rdParty/gslib/src/obbox.c b/3rdParty/gslib/src/obbox.c deleted file mode 100644 index 22c4614f3..000000000 --- a/3rdParty/gslib/src/obbox.c +++ /dev/null @@ -1,341 +0,0 @@ -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "mem.h" -#include "tensor.h" -#include "poly.h" -#include "lob_bnd.h" - -#define obbox_calc_2 PREFIXED_NAME(obbox_calc_2) -#define obbox_calc_3 PREFIXED_NAME(obbox_calc_3) - -struct obbox_2 { double c0[2], A[4]; - struct dbl_range x[2]; }; - -struct obbox_3 { double c0[3], A[9]; - struct dbl_range x[3]; }; - - -static void copy_strided(double *out, const double *in, - unsigned g, unsigned s, unsigned n) -{ - if(g==1) for(;n;--n,in+=s) *out++ = *in; - else { - s *= g; - for(;n;--n,in+=s) memcpy(out,in,g*sizeof(double)), out+=g; - } -} - -static void mat_inv_2(double inv[4], const double A[4]) -{ - const double idet = 1/(A[0]*A[3]-A[1]*A[2]); - inv[0] = 
idet*A[3]; - inv[1] = -(idet*A[1]); - inv[2] = -(idet*A[2]); - inv[3] = idet*A[0]; -} - -static void mat_inv_3(double inv[9], const double A[9]) -{ - const double a = A[4]*A[8]-A[5]*A[7], - b = A[5]*A[6]-A[3]*A[8], - c = A[3]*A[7]-A[4]*A[6], - idet = 1/(A[0]*a+A[1]*b+A[2]*c); - inv[0] = idet*a; - inv[1] = idet*(A[2]*A[7]-A[1]*A[8]); - inv[2] = idet*(A[1]*A[5]-A[2]*A[4]); - inv[3] = idet*b; - inv[4] = idet*(A[0]*A[8]-A[2]*A[6]); - inv[5] = idet*(A[2]*A[3]-A[0]*A[5]); - inv[6] = idet*c; - inv[7] = idet*(A[1]*A[6]-A[0]*A[7]); - inv[8] = idet*(A[0]*A[4]-A[1]*A[3]); -} - -static struct dbl_range dbl_range_merge(struct dbl_range a, struct dbl_range b) -{ - struct dbl_range m; - m.min = b.minb.max?a.max:b.max; - return m; -} - -static struct dbl_range dbl_range_expand(struct dbl_range b, double tol) -{ - double a = (b.min+b.max)/2, l = (b.max-b.min)*(1+tol)/2; - struct dbl_range m; - m.min = a-l, m.max = a+l; - return m; -} - -static void bbox_2_tfm(double *out, const double x0[2], const double Ji[4], - const double *x, const double *y, unsigned n) -{ - unsigned i; - for(i=0;ix[0].min)*(b->x[0].max-x); - return bx<0 ? bx : (y-b->x[1].min)*(b->x[1].max-y); -} - -/* positive when possibly inside */ -double obbox_test_2(const struct obbox_2 *const b, - const double x, const double y) -{ - const double bxy = obbox_axis_test_2(b,x,y); - if(bxy<0) return bxy; else { - const double dx = x-b->c0[0], dy = y-b->c0[1]; - const double r = b->A[0]*dx + b->A[1]*dy, - s = b->A[2]*dx + b->A[3]*dy; - const double br = (r+1)*(1-r); - return br<0 ? 
br : (s+1)*(1-s); - } -} - -#endif - -#define DO_MAX(a,b) do { unsigned temp = b; if(temp>a) a=temp; } while(0) - -void obbox_calc_2(struct obbox_2 *out, - const double *const elx[2], - const unsigned n[2], uint nel, - const unsigned m[2], const double tol) -{ - const double *x = elx[0], *y = elx[1]; - const unsigned nr = n[0], ns = n[1]; - const unsigned mr = m[0], ms = m[1]; - - const unsigned nrs = nr*ns; - double *data; - const unsigned lbsize0 = lob_bnd_size(nr,mr), - lbsize1 = lob_bnd_size(ns,ms); - unsigned wsize = 4*ns+2*ms; - DO_MAX(wsize,2*nr+2*mr); - DO_MAX(wsize,gll_lag_size(nr)); - DO_MAX(wsize,gll_lag_size(ns)); - data = tmalloc(double, 2*(nr+ns)+lbsize0+lbsize1+wsize); - - { - double *const I0r = data, *const I0s = data+2*nr; - double *const lob_bnd_data_r = data+2*(nr+ns), - *const lob_bnd_data_s = data+2*(nr+ns)+lbsize0; - double *const work = data+2*(nr+ns)+lbsize0+lbsize1; - - #define SETUP_DIR(r) do { \ - lagrange_fun *const lag = gll_lag_setup(work, n##r); \ - lag(I0##r, work,n##r,1, 0); \ - lob_bnd_setup(lob_bnd_data_##r, n##r,m##r); \ - } while(0) - - SETUP_DIR(r); SETUP_DIR(s); - - #undef SETUP_DIR - - for(;nel;--nel,x+=nrs,y+=nrs,++out) { - double x0[2], J[4], Ji[4]; - struct dbl_range ab[2], tb[2]; - - /* double work[2*nr] */ - x0[0] = tensor_ig2(J , I0r,nr, I0s,ns, x, work); - x0[1] = tensor_ig2(J+2, I0r,nr, I0s,ns, y, work); - mat_inv_2(Ji, J); - - /* double work[2*m##r] */ - #define DO_BOUND(bnd,merge,r,x,work) do { \ - struct dbl_range b = \ - lob_bnd_1(lob_bnd_data_##r,n##r,m##r, x, work); \ - if(merge) bnd=dbl_range_merge(bnd,b); else bnd=b; \ - } while(0) - - /* double work[2*n##r + 2*m##r] */ - #define DO_EDGE(merge,r,x,y,work) do { \ - DO_BOUND(ab[0],merge,r,x,work); \ - DO_BOUND(ab[1],merge,r,y,work); \ - bbox_2_tfm(work, x0,Ji, x,y,n##r); \ - DO_BOUND(tb[0],merge,r,(work) ,(work)+2*n##r); \ - DO_BOUND(tb[1],merge,r,(work)+n##r,(work)+2*n##r); \ - } while(0) - - DO_EDGE(0,r,x,y,work); - DO_EDGE(1,r,&x[nrs-nr],&y[nrs-nr],work); - 
- /* double work[4*ns + 2*ms] */ - #define GET_EDGE(off) do { \ - copy_strided(work , x+off,1,nr,ns); \ - copy_strided(work+ns, y+off,1,nr,ns); \ - DO_EDGE(1,s,work,work+ns,work+2*ns); \ - } while(0) - - GET_EDGE(0); - GET_EDGE(nr-1); - - #undef GET_EDGE - #undef DO_EDGE - #undef DO_BOUND - - out->x[0] = dbl_range_expand(ab[0],tol), - out->x[1] = dbl_range_expand(ab[1],tol); - - { - const double av0=(tb[0].min+tb[0].max)/2, av1=(tb[1].min+tb[1].max)/2; - out->c0[0] = x0[0] + J[0]*av0 + J[1]*av1; - out->c0[1] = x0[1] + J[2]*av0 + J[3]*av1; - } - { - const double di0 = 2/((1+tol)*(tb[0].max-tb[0].min)), - di1 = 2/((1+tol)*(tb[1].max-tb[1].min)); - out->A[0]=di0*Ji[0], out->A[1]=di0*Ji[1]; - out->A[2]=di1*Ji[2], out->A[3]=di1*Ji[3]; - } - - } - } - - free(data); -} - -void obbox_calc_3(struct obbox_3 *out, - const double *const elx[3], - const unsigned n[3], uint nel, - const unsigned m[3], const double tol) -{ - const double *x = elx[0], *y = elx[1], *z = elx[2]; - const unsigned nr = n[0], ns = n[1], nt = n[2]; - const unsigned mr = m[0], ms = m[1], mt = m[2]; - - const unsigned nrs = nr*ns, nrst = nr*ns*nt; - double *data; - const unsigned lbsize0 = lob_bnd_size(nr,mr), - lbsize1 = lob_bnd_size(ns,ms), - lbsize2 = lob_bnd_size(nt,mt); - unsigned wsize = 3*nr*ns+2*mr*(ns+ms+1); - DO_MAX(wsize,6*nr*nt+2*mr*(nt+mt+1)); - DO_MAX(wsize,6*ns*nt+2*ms*(nt+mt+1)); - DO_MAX(wsize,2*nr*ns+3*nr); - DO_MAX(wsize,gll_lag_size(nr)); - DO_MAX(wsize,gll_lag_size(ns)); - DO_MAX(wsize,gll_lag_size(nt)); - data = tmalloc(double, 2*(nr+ns+nt)+lbsize0+lbsize1+lbsize2+wsize); - - { - double *const I0r = data, *const I0s = I0r+2*nr, *const I0t = I0s+2*ns; - double *const lob_bnd_data_r = data+2*(nr+ns+nt), - *const lob_bnd_data_s = data+2*(nr+ns+nt)+lbsize0, - *const lob_bnd_data_t = data+2*(nr+ns+nt)+lbsize0+lbsize1; - double *const work = data+2*(nr+ns+nt)+lbsize0+lbsize1+lbsize2; - - #define SETUP_DIR(r) do { \ - lagrange_fun *const lag = gll_lag_setup(work, n##r); \ - lag(I0##r, 
work,n##r,1, 0); \ - lob_bnd_setup(lob_bnd_data_##r, n##r,m##r); \ - } while(0) - - SETUP_DIR(r); SETUP_DIR(s); SETUP_DIR(t); - - #undef SETUP_DIR - - for(;nel;--nel,x+=nrst,y+=nrst,z+=nrst,++out) { - double x0[3], J[9], Ji[9]; - struct dbl_range ab[3], tb[3]; - - /* double work[2*nrs+3*nr] */ - #define EVAL_AT_0(d,x) \ - x0[d] = tensor_ig3(J+3*d, I0r,nr, I0s,ns, I0t,nt, x, work) - EVAL_AT_0(0,x); EVAL_AT_0(1,y); EVAL_AT_0(2,z); - mat_inv_3(Ji, J); - #undef EVAL_AT_0 - - /* double work[2*m##r*(n##s+m##s+1)] */ - #define DO_BOUND(bnd,merge,r,s,x,work) do { \ - struct dbl_range b = \ - lob_bnd_2(lob_bnd_data_##r,n##r,m##r, \ - lob_bnd_data_##s,n##s,m##s, x, work); \ - if(merge) bnd=dbl_range_merge(bnd,b); else bnd=b; \ - } while(0) - - /* double work[3*n##r*n##s+2*m##r*(n##s+m##s+1)] */ - #define DO_FACE(merge,r,s,x,y,z,work) do { \ - DO_BOUND(ab[0],merge,r,s,x,work); \ - DO_BOUND(ab[1],merge,r,s,y,work); \ - DO_BOUND(ab[2],merge,r,s,z,work); \ - bbox_3_tfm(work, x0,Ji, x,y,z,n##r*n##s); \ - DO_BOUND(tb[0],merge,r,s,(work) ,(work)+3*n##r*n##s); \ - DO_BOUND(tb[1],merge,r,s,(work)+ n##r*n##s,(work)+3*n##r*n##s); \ - DO_BOUND(tb[2],merge,r,s,(work)+2*n##r*n##s,(work)+3*n##r*n##s); \ - } while(0) - - DO_FACE(0,r,s,x,y,z,work); - DO_FACE(1,r,s,&x[nrst-nrs],&y[nrst-nrs],&z[nrst-nrs],work); - - /* double work[6*n##r*n##s+2*m##r*(n##s+m##s+1)] */ - #define GET_FACE(r,s,off,n1,n2,n3) do { \ - copy_strided(work , x+off,n1,n2,n3); \ - copy_strided(work+ n##r*n##s, y+off,n1,n2,n3); \ - copy_strided(work+2*n##r*n##s, z+off,n1,n2,n3); \ - DO_FACE(1,r,s,work,work+n##r*n##s,work+2*n##r*n##s,work+3*n##r*n##s); \ - } while(0) - - GET_FACE(r,t,0 ,nr,ns,nt); - GET_FACE(r,t,nrs-nr,nr,ns,nt); - GET_FACE(s,t,0 , 1,nr,ns*nt); - GET_FACE(s,t,nr-1 , 1,nr,ns*nt); - - #undef GET_FACE - #undef DO_FACE - #undef DO_BOUND - - out->x[0] = dbl_range_expand(ab[0],tol), - out->x[1] = dbl_range_expand(ab[1],tol); - out->x[2] = dbl_range_expand(ab[2],tol); - - { - const double av0 = 
(tb[0].min+tb[0].max)/2, - av1 = (tb[1].min+tb[1].max)/2, - av2 = (tb[2].min+tb[2].max)/2; - out->c0[0] = x0[0] + J[0]*av0 + J[1]*av1 + J[2]*av2; - out->c0[1] = x0[1] + J[3]*av0 + J[4]*av1 + J[5]*av2; - out->c0[2] = x0[2] + J[6]*av0 + J[7]*av1 + J[8]*av2; - } - { - const double di0 = 2/((1+tol)*(tb[0].max-tb[0].min)), - di1 = 2/((1+tol)*(tb[1].max-tb[1].min)), - di2 = 2/((1+tol)*(tb[2].max-tb[2].min)); - out->A[0]=di0*Ji[0], out->A[1]=di0*Ji[1], out->A[2]=di0*Ji[2]; - out->A[3]=di1*Ji[3], out->A[4]=di1*Ji[4], out->A[5]=di1*Ji[5]; - out->A[6]=di2*Ji[6], out->A[7]=di2*Ji[7], out->A[8]=di2*Ji[8]; - } - - } - } - - free(data); -} - diff --git a/3rdParty/gslib/src/obbox.h b/3rdParty/gslib/src/obbox.h deleted file mode 100644 index 8e5764fe2..000000000 --- a/3rdParty/gslib/src/obbox.h +++ /dev/null @@ -1,113 +0,0 @@ -#ifndef OBBOX_H -#define OBBOX_H - -#if !defined(TYPES_H) || !defined(NAME_H) -#warning "obbox.h" requires "types.h" and "name.h" -#endif - -#define obbox_calc_2 PREFIXED_NAME(obbox_calc_2) -#define obbox_calc_3 PREFIXED_NAME(obbox_calc_3) - -/*-------------------------------------------------------------------------- - Oriented and axis-aligned bounding box computation for spectral elements - - Usage: - - double x[n][nt][ns][nr], y[n][nt][ns][nr], z[n][nt][ns][nr]; - obbox_3 ob[n]; - - unsigned mr=4*nr, ms=4*ns, mt=4*nt; - double tol = 1e-6; - obbox_3_calc(ob, x,y,z, nr,ns,nt,n, mr,ms,mt, tol); - - The parameters mr,ms,mt specify number of points to use in computing - bounds (see lob_bnd.h). It is expected that mr>nr, etc. For reasonable - quality, a factor of at least 2 is recommended. - - tol is a relative amount by which to expand the bounding box. - This would accommodate, e.g., rounding errors. 
- - The axis aligned bounds for a given element are - ob[i].x.min <= x <= ob[i].x.max - ob[i].y.min <= y <= ob[i].y.max - ob[i].z.min <= z <= ob[i].z.max - - The oriented bounding box is given by - (-1,-1,-1)^T <= ob[i].A * (x - ob[i].c0) <= (1,1,1) - - where the matrix is row-major format, - dx = x - c0[0], dy = y - c0[1], dz = z - c0[2] - -1 <= r[0] = A[0]*dx + A[1]*dy + A[2]*dz <= 1 - -1 <= r[1] = A[3]*dx + A[4]*dy + A[5]*dz <= 1 - -1 <= r[2] = A[6]*dx + A[7]*dy + A[8]*dz <= 1 - - Also, ob[i].A * (x - ob[i].c0) should be a reasonable seed for Newton's. - - --------------------------------------------------------------------------*/ - -#ifndef LOB_BND_H -struct dbl_range { double min, max; }; -#endif - -struct obbox_2 { double c0[2], A[4]; - struct dbl_range x[2]; }; - -struct obbox_3 { double c0[3], A[9]; - struct dbl_range x[3]; }; - -void obbox_calc_2(struct obbox_2 *out, - const double *const elx[2], - const unsigned n[2], uint nel, - const unsigned m[2], const double tol); - -void obbox_calc_3(struct obbox_3 *out, - const double *const elx[3], - const unsigned n[3], uint nel, - const unsigned m[3], const double tol); - -/* positive when possibly inside */ -static double obbox_axis_test_2(const struct obbox_2 *const b, - const double x[2]) -{ - const double bx = (x[0]-b->x[0].min)*(b->x[0].max-x[0]); - return bx<0 ? bx : (x[1]-b->x[1].min)*(b->x[1].max-x[1]); -} - -/* positive when possibly inside */ -static double obbox_test_2(const struct obbox_2 *const b, const double x[2]) -{ - const double bxy = obbox_axis_test_2(b,x); - if(bxy<0) return bxy; else { - const double dx = x[0]-b->c0[0], dy = x[1]-b->c0[1]; - const double r = b->A[0]*dx + b->A[1]*dy, - s = b->A[2]*dx + b->A[3]*dy; - const double br = (r+1)*(1-r); - return br<0 ? 
br : (s+1)*(1-s); - } -} - -/* positive when possibly inside */ -static double obbox_axis_test_3(const struct obbox_3 *const b, - const double x[3]) -{ - const double bx = (x[0]-b->x[0].min)*(b->x[0].max-x[0]); - const double by = (x[1]-b->x[1].min)*(b->x[1].max-x[1]); - return bx<0 ? bx : (by<0 ? by : (x[2]-b->x[2].min)*(b->x[2].max-x[2])); -} - -/* positive when possibly inside */ -static double obbox_test_3(const struct obbox_3 *const b, const double x[3]) -{ - const double bxyz = obbox_axis_test_3(b,x); - if(bxyz<0) return bxyz; else { - const double dx = x[0]-b->c0[0], dy = x[1]-b->c0[1], dz = x[2]-b->c0[2]; - const double r = b->A[0]*dx + b->A[1]*dy + b->A[2]*dz, - s = b->A[3]*dx + b->A[4]*dy + b->A[5]*dz, - t = b->A[6]*dx + b->A[7]*dy + b->A[8]*dz; - const double br = (r+1)*(1-r), bs = (s+1)*(1-s); - return br<0 ? br : (bs<0 ? bs : (t+1)*(1-t)); - } -} - -#endif - diff --git a/3rdParty/gslib/src/poly.c b/3rdParty/gslib/src/poly.c deleted file mode 100644 index 00ad22b11..000000000 --- a/3rdParty/gslib/src/poly.c +++ /dev/null @@ -1,236 +0,0 @@ -#include -#include -#include /* for cos, fabs */ -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "mem.h" - -#define lagrange_size PREFIXED_NAME(lagrange_size ) -#define lagrange_setup PREFIXED_NAME(lagrange_setup) -#define gauss_nodes PREFIXED_NAME(gauss_nodes ) -#define gauss_quad PREFIXED_NAME(gauss_quad ) -#define lobatto_nodes PREFIXED_NAME(lobatto_nodes ) -#define lobatto_quad PREFIXED_NAME(lobatto_quad ) -#define gll_lag_size PREFIXED_NAME(gll_lag_size ) -#define gll_lag_setup PREFIXED_NAME(gll_lag_setup ) - -typedef void lagrange_fun(double *restrict p, - double *restrict data, unsigned n, int d, double x); - -#include "poly_imp.h" - -static void lagrange_eval(double *restrict p, - double *restrict data, unsigned n, int der, double x) -{{ - unsigned i; - const double *restrict z=data, *restrict w=z+n; - double *restrict d=data+2*n, *restrict u0=d+n, *restrict v0=u0+n; - 
for(i=0;i0) { - double *restrict p1 = p+n, *restrict u1=v0+n, *restrict v1=u1+n; - u1[0 ]=0; for(i=0 ;i1) { - double *restrict p2 = p1+n, *restrict u2=v1+n, *restrict v2=u2+n; - u2[0 ]=0; for(i=0 ;i Gauss-Legendre quadrature (open) - Lobatto -> Gauss-Lobatto-Legendre quadrature (closed at both ends) - Polynomial bases - Legendre -> Legendre basis - Gauss -> Lagrangian basis using Gauss quadrature nodes - Lobatto -> Lagrangian basis using Lobatto quadrature nodes -*/ - -/*-------------------------------------------------------------------------- - Legendre Polynomial Computation - compute P_n(x) or P_n'(x) or P_n''(x) - --------------------------------------------------------------------------*/ - -/* precondition: n >= 0 */ -static double legendre(int n, double x) -{ - double p[2]; - double i, nn=n-0.5; /* avoid int -> double conversions */ - p[0]=1.,p[1]=x; - for(i=1; i 0 */ -static double legendre_d1(int n, double x) -{ - double p[2]; - double i, nn=n-0.5; /* avoid int -> double conversions */ - p[0]=3*x,p[1]=1; - for(i=2; i 1 */ -static double legendre_d2(int n, double x) -{ - double p[2]; - double i, nn=n-0.5; /* avoid int -> double conversions */ - p[0]=3,p[1]=15*x; - for(i=3; i-x*EPS); - z[i] = x - legendre(n,x)/legendre_d1(n,x); - } - if(n&1) z[n/2]=0; - for(j=(n+1)/2,i=n/2-1; j-x*EPS); - z[i] = x - legendre_d1(np,x)/legendre_d2(np,x); - } - if(n&1) z[n/2]=0; - for(j=(n+1)/2,i=n/2-1; j=4) { - const double *restrict gllz = gllz_table[n-4]; - int i,j; - for(i=1;i<=n/2-1;++i) z[i] = -gllz[i-1]; - for(j=(n+1)/2,i=n/2-1; jGLL_LAG_FIX_MAX) lobatto_nodes_n(z,n); - else if(n>=2) lobatto_nodes_fix(z,n); -} - -void gauss_quad(double *restrict z, double *restrict w, int n) -{ - int i,j; - gauss_nodes(z,n); - for(i=0; i<=(n-1)/2; ++i) { - double d = (n+1)*legendre(n+1,z[i]); - w[i] = 2*(1-z[i]*z[i])/(d*d); - } - for(j=(n+1)/2,i=n/2-1; j Gauss-Legendre quadrature (open) - Lobatto -> Gauss-Lobatto-Legendre quadrature (closed at both ends) - - the _quad functions compute 
both nodes and weights - --------------------------------------------------------------------------*/ - -void gauss_nodes(double *restrict z, int n); /* n nodes (order = 2n-1) */ -void lobatto_nodes(double *restrict z, int n); /* n nodes (order = 2n-3) */ - -void gauss_quad(double *restrict z, double *restrict w, int n); -void lobatto_quad(double *restrict z, double *restrict w, int n); - -/*-------------------------------------------------------------------------- - Lagrangian basis function evaluation - - Usage: - - double z[N] = ..., x = ...; // nodes and evaluation point - double p[3*N]; - double *data = tmalloc(double, lagrange_size(N)); - lagrange_fun *const lag = lagrange_setup(data, z, N); - - int d = ...; // 0, 1, or 2 --- the highest derivative to compute - lag(p, data,N,d, x); - // now p[i] = h_i(x), 0 <= i < N - // if d>=1, p[N+i] = h_i'(x) - // if d>=2, p[2*N+i] = h_i''(x) - free(data); - - gll_lag_* are similar, but are specialized for GLL nodes, and faster, - and also don't need to be given the nodal locations - --------------------------------------------------------------------------*/ - -typedef void lagrange_fun(double *restrict p, - double *restrict data, unsigned n, int d, double x); - -unsigned lagrange_size(unsigned n); -lagrange_fun *lagrange_setup( - double *restrict data, const double *restrict z, unsigned n); - -unsigned gll_lag_size(unsigned n); -lagrange_fun *gll_lag_setup(double *restrict data, int n); - - -#endif - diff --git a/3rdParty/gslib/src/poly_imp.h b/3rdParty/gslib/src/poly_imp.h deleted file mode 100644 index 6ca55a7a4..000000000 --- a/3rdParty/gslib/src/poly_imp.h +++ /dev/null @@ -1,1949 +0,0 @@ -/* generated by gen_poly_imp.c */ - -#define GLL_LAG_FIX_MAX 24 - -static void gll_lag_02(double *restrict p, double *restrict w, - unsigned n, int d, double xh) -{ - const double x = xh*2; - const double d00=x+2 ,d01=x-2 ; - const double u0_01= 1*d00; - const double v0_00=d01* 1; - p[ 0]=w[ 0]* 1*v0_00; p[ 1]=w[ 1]*u0_01* 1; - 
if(d>0) { - p[2+ 0]=2*w[ 0]*( 1); - p[2+ 1]=2*w[ 1]*( 1 ); - if(d>1) { - p[2*2+ 0]=0; - p[2*2+ 1]=0; - } - } -} - -static void gll_lag_03(double *restrict p, double *restrict w, - unsigned n, int d, double xh) -{ - const double x = xh*2; - const double d00=x+2 ,d01=x ,d02=x-2 ; - const double u0_01= 1*d00,u0_02=u0_01*d01; - const double v0_01=d02* 1,v0_00=d01*v0_01; - p[ 0]=w[ 0]* 1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02* 1; - if(d>0) { - const double u1_02= 1*d01+u0_01; - const double v1_00=d01* 1+v0_01; - p[3+ 0]=2*w[ 0]*( v1_00); - p[3+ 1]=2*w[ 1]*( 1*v0_01+u0_01* 1); - p[3+ 2]=2*w[ 2]*(u1_02 ); - if(d>1) { - p[2*3+ 0]=4*w[ 0]*( + 1* 2); - p[2*3+ 1]=4*w[ 1]*( +2* 1* 1 ); - p[2*3+ 2]=4*w[ 2]*( 2* 1 ); - } - } -} - -static const double gllz_04[ 1] = { - 0.44721359549995793928183473374625524708812367192231 -}; - -static void gll_lag_04(double *restrict p, double *restrict w, - unsigned n, int d, double xh) -{ - const double x = xh*2; - const double d00=x+2 ,d01=x+2*gllz_04[ 0],d02=x-2*gllz_04[ 0], - d03=x-2 ; - const double u0_01= 1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02; - const double v0_02=d03* 1,v0_01=d02*v0_02,v0_00=d01*v0_01; - p[ 0]=w[ 0]* 1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02; - p[ 3]=w[ 3]*u0_03* 1; - if(d>0) { - const double u1_02= 1*d01+u0_01,u1_03=u1_02*d02+u0_02; - const double v1_01=d02* 1+v0_02,v1_00=d01*v1_01+v0_01; - p[4+ 0]=2*w[ 0]*( v1_00); - p[4+ 1]=2*w[ 1]*( 1*v0_01+u0_01*v1_01); - p[4+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02* 1); - p[4+ 3]=2*w[ 3]*(u1_03 ); - if(d>1) { - const double u2_03= 2*d02+2*u1_02; - const double v2_00=d01* 2+2*v1_01; - p[2*4+ 0]=4*w[ 0]*( + 1*v2_00); - p[2*4+ 1]=4*w[ 1]*( +2* 1*v1_01+u0_01* 2); - p[2*4+ 2]=4*w[ 2]*( 2*v0_02+2*u1_02* 1 ); - p[2*4+ 3]=4*w[ 3]*(u2_03* 1 ); - } - } -} - -static const double gllz_05[ 1] = { - 0.65465367070797714379829245624685835556920808239542 -}; - -static void gll_lag_05(double *restrict p, double *restrict w, - unsigned n, int d, double xh) -{ - const double x = xh*2; 
- const double d00=x+2 ,d01=x+2*gllz_05[ 0],d02=x , - d03=x-2*gllz_05[ 0],d04=x-2 ; - const double u0_01= 1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02, - u0_04=u0_03*d03; - const double v0_03=d04* 1,v0_02=d03*v0_03,v0_01=d02*v0_02, - v0_00=d01*v0_01; - p[ 0]=w[ 0]* 1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02; - p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04* 1; - if(d>0) { - const double u1_02= 1*d01+u0_01,u1_03=u1_02*d02+u0_02, - u1_04=u1_03*d03+u0_03; - const double v1_02=d03* 1+v0_03,v1_01=d02*v1_02+v0_02, - v1_00=d01*v1_01+v0_01; - p[5+ 0]=2*w[ 0]*( v1_00); - p[5+ 1]=2*w[ 1]*( 1*v0_01+u0_01*v1_01); - p[5+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02); - p[5+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03* 1); - p[5+ 4]=2*w[ 4]*(u1_04 ); - if(d>1) { - const double u2_03= 2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03; - const double v2_01=d02* 2+2*v1_02,v2_00=d01*v2_01+2*v1_01; - p[2*5+ 0]=4*w[ 0]*( + 1*v2_00); - p[2*5+ 1]=4*w[ 1]*( +2* 1*v1_01+u0_01*v2_01); - p[2*5+ 2]=4*w[ 2]*( 2*v0_02+2*u1_02*v1_02+u0_02* 2); - p[2*5+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03* 1 ); - p[2*5+ 4]=4*w[ 4]*(u2_04* 1 ); - } - } -} - -static const double gllz_06[ 2] = { - 0.7650553239294646928510029739593381503657356885361, - 0.28523151648064509631415099404087907191900347272643 -}; - -static void gll_lag_06(double *restrict p, double *restrict w, - unsigned n, int d, double xh) -{ - const double x = xh*2; - const double d00=x+2 ,d01=x+2*gllz_06[ 0],d02=x+2*gllz_06[ 1], - d03=x-2*gllz_06[ 1],d04=x-2*gllz_06[ 0],d05=x-2 ; - const double u0_01= 1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02, - u0_04=u0_03*d03,u0_05=u0_04*d04; - const double v0_04=d05* 1,v0_03=d04*v0_04,v0_02=d03*v0_03, - v0_01=d02*v0_02,v0_00=d01*v0_01; - p[ 0]=w[ 0]* 1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02; - p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05* 1; - if(d>0) { - const double u1_02= 1*d01+u0_01,u1_03=u1_02*d02+u0_02, - u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04; - const double v1_03=d04* 1+v0_04,v1_02=d03*v1_03+v0_03, 
- v1_01=d02*v1_02+v0_02,v1_00=d01*v1_01+v0_01; - p[6+ 0]=2*w[ 0]*( v1_00); - p[6+ 1]=2*w[ 1]*( 1*v0_01+u0_01*v1_01); - p[6+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02); - p[6+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03); - p[6+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04* 1); - p[6+ 5]=2*w[ 5]*(u1_05 ); - if(d>1) { - const double u2_03= 2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03, - u2_05=u2_04*d04+2*u1_04; - const double v2_02=d03* 2+2*v1_03,v2_01=d02*v2_02+2*v1_02, - v2_00=d01*v2_01+2*v1_01; - p[2*6+ 0]=4*w[ 0]*( + 1*v2_00); - p[2*6+ 1]=4*w[ 1]*( +2* 1*v1_01+u0_01*v2_01); - p[2*6+ 2]=4*w[ 2]*( 2*v0_02+2*u1_02*v1_02+u0_02*v2_02); - p[2*6+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03* 2); - p[2*6+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04* 1 ); - p[2*6+ 5]=4*w[ 5]*(u2_05* 1 ); - } - } -} - -static const double gllz_07[ 2] = { - 0.830223896278566929872032213967465139587170364872, - 0.46884879347071421380377188190876632940559747167184 -}; - -static void gll_lag_07(double *restrict p, double *restrict w, - unsigned n, int d, double xh) -{ - const double x = xh*2; - const double d00=x+2 ,d01=x+2*gllz_07[ 0],d02=x+2*gllz_07[ 1], - d03=x ,d04=x-2*gllz_07[ 1],d05=x-2*gllz_07[ 0], - d06=x-2 ; - const double u0_01= 1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02, - u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05; - const double v0_05=d06* 1,v0_04=d05*v0_05,v0_03=d04*v0_04, - v0_02=d03*v0_03,v0_01=d02*v0_02,v0_00=d01*v0_01; - p[ 0]=w[ 0]* 1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02; - p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05; - p[ 6]=w[ 6]*u0_06* 1; - if(d>0) { - const double u1_02= 1*d01+u0_01,u1_03=u1_02*d02+u0_02, - u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04, - u1_06=u1_05*d05+u0_05; - const double v1_04=d05* 1+v0_05,v1_03=d04*v1_04+v0_04, - v1_02=d03*v1_03+v0_03,v1_01=d02*v1_02+v0_02, - v1_00=d01*v1_01+v0_01; - p[7+ 0]=2*w[ 0]*( v1_00); - p[7+ 1]=2*w[ 1]*( 1*v0_01+u0_01*v1_01); - p[7+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02); - p[7+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03); - p[7+ 
4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04); - p[7+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05* 1); - p[7+ 6]=2*w[ 6]*(u1_06 ); - if(d>1) { - const double u2_03= 2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03, - u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05; - const double v2_03=d04* 2+2*v1_04,v2_02=d03*v2_03+2*v1_03, - v2_01=d02*v2_02+2*v1_02,v2_00=d01*v2_01+2*v1_01; - p[2*7+ 0]=4*w[ 0]*( + 1*v2_00); - p[2*7+ 1]=4*w[ 1]*( +2* 1*v1_01+u0_01*v2_01); - p[2*7+ 2]=4*w[ 2]*( 2*v0_02+2*u1_02*v1_02+u0_02*v2_02); - p[2*7+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03); - p[2*7+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04* 2); - p[2*7+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05* 1 ); - p[2*7+ 6]=4*w[ 6]*(u2_06* 1 ); - } - } -} - -static const double gllz_08[ 3] = { - 0.87174014850960661533744576122066343810378066967698, - 0.59170018143314230214451073139795318994570098951733, - 0.20929921790247886876865726034535125529554540508668 -}; - -static void gll_lag_08(double *restrict p, double *restrict w, - unsigned n, int d, double xh) -{ - const double x = xh*2; - const double d00=x+2 ,d01=x+2*gllz_08[ 0],d02=x+2*gllz_08[ 1], - d03=x+2*gllz_08[ 2],d04=x-2*gllz_08[ 2],d05=x-2*gllz_08[ 1], - d06=x-2*gllz_08[ 0],d07=x-2 ; - const double u0_01= 1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02, - u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05, - u0_07=u0_06*d06; - const double v0_06=d07* 1,v0_05=d06*v0_06,v0_04=d05*v0_05, - v0_03=d04*v0_04,v0_02=d03*v0_03,v0_01=d02*v0_02, - v0_00=d01*v0_01; - p[ 0]=w[ 0]* 1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02; - p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05; - p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07* 1; - if(d>0) { - const double u1_02= 1*d01+u0_01,u1_03=u1_02*d02+u0_02, - u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04, - u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06; - const double v1_05=d06* 1+v0_06,v1_04=d05*v1_05+v0_05, - v1_03=d04*v1_04+v0_04,v1_02=d03*v1_03+v0_03, - v1_01=d02*v1_02+v0_02,v1_00=d01*v1_01+v0_01; - p[8+ 0]=2*w[ 0]*( v1_00); - 
p[8+ 1]=2*w[ 1]*( 1*v0_01+u0_01*v1_01); - p[8+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02); - p[8+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03); - p[8+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04); - p[8+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05); - p[8+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06* 1); - p[8+ 7]=2*w[ 7]*(u1_07 ); - if(d>1) { - const double u2_03= 2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03, - u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05, - u2_07=u2_06*d06+2*u1_06; - const double v2_04=d05* 2+2*v1_05,v2_03=d04*v2_04+2*v1_04, - v2_02=d03*v2_03+2*v1_03,v2_01=d02*v2_02+2*v1_02, - v2_00=d01*v2_01+2*v1_01; - p[2*8+ 0]=4*w[ 0]*( + 1*v2_00); - p[2*8+ 1]=4*w[ 1]*( +2* 1*v1_01+u0_01*v2_01); - p[2*8+ 2]=4*w[ 2]*( 2*v0_02+2*u1_02*v1_02+u0_02*v2_02); - p[2*8+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03); - p[2*8+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04); - p[2*8+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05* 2); - p[2*8+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06* 1 ); - p[2*8+ 7]=4*w[ 7]*(u2_07* 1 ); - } - } -} - -static const double gllz_09[ 3] = { - 0.8997579954114601573123452444183379580514802955661, - 0.67718627951073775344588542709134245071102964761391, - 0.36311746382617815871075206870865921302064227760088 -}; - -static void gll_lag_09(double *restrict p, double *restrict w, - unsigned n, int d, double xh) -{ - const double x = xh*2; - const double d00=x+2 ,d01=x+2*gllz_09[ 0],d02=x+2*gllz_09[ 1], - d03=x+2*gllz_09[ 2],d04=x ,d05=x-2*gllz_09[ 2], - d06=x-2*gllz_09[ 1],d07=x-2*gllz_09[ 0],d08=x-2 ; - const double u0_01= 1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02, - u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05, - u0_07=u0_06*d06,u0_08=u0_07*d07; - const double v0_07=d08* 1,v0_06=d07*v0_07,v0_05=d06*v0_06, - v0_04=d05*v0_05,v0_03=d04*v0_04,v0_02=d03*v0_03, - v0_01=d02*v0_02,v0_00=d01*v0_01; - p[ 0]=w[ 0]* 1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02; - p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05; - p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 
8]*u0_08* 1; - if(d>0) { - const double u1_02= 1*d01+u0_01,u1_03=u1_02*d02+u0_02, - u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04, - u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06, - u1_08=u1_07*d07+u0_07; - const double v1_06=d07* 1+v0_07,v1_05=d06*v1_06+v0_06, - v1_04=d05*v1_05+v0_05,v1_03=d04*v1_04+v0_04, - v1_02=d03*v1_03+v0_03,v1_01=d02*v1_02+v0_02, - v1_00=d01*v1_01+v0_01; - p[9+ 0]=2*w[ 0]*( v1_00); - p[9+ 1]=2*w[ 1]*( 1*v0_01+u0_01*v1_01); - p[9+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02); - p[9+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03); - p[9+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04); - p[9+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05); - p[9+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06); - p[9+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07* 1); - p[9+ 8]=2*w[ 8]*(u1_08 ); - if(d>1) { - const double u2_03= 2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03, - u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05, - u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07; - const double v2_05=d06* 2+2*v1_06,v2_04=d05*v2_05+2*v1_05, - v2_03=d04*v2_04+2*v1_04,v2_02=d03*v2_03+2*v1_03, - v2_01=d02*v2_02+2*v1_02,v2_00=d01*v2_01+2*v1_01; - p[2*9+ 0]=4*w[ 0]*( + 1*v2_00); - p[2*9+ 1]=4*w[ 1]*( +2* 1*v1_01+u0_01*v2_01); - p[2*9+ 2]=4*w[ 2]*( 2*v0_02+2*u1_02*v1_02+u0_02*v2_02); - p[2*9+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03); - p[2*9+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04); - p[2*9+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05); - p[2*9+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06* 2); - p[2*9+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07* 1 ); - p[2*9+ 8]=4*w[ 8]*(u2_08* 1 ); - } - } -} - -static const double gllz_10[ 4] = { - 0.91953390816645881382893266082233813415354307544628, - 0.73877386510550507500310617485983072501618510137693, - 0.47792494981044449566117509273125799788677289333057, - 0.16527895766638702462621976595817353323115034354948 -}; - -static void gll_lag_10(double *restrict p, double *restrict w, - unsigned n, int d, double xh) -{ - const double x = xh*2; - const double d00=x+2 ,d01=x+2*gllz_10[ 
0],d02=x+2*gllz_10[ 1], - d03=x+2*gllz_10[ 2],d04=x+2*gllz_10[ 3],d05=x-2*gllz_10[ 3], - d06=x-2*gllz_10[ 2],d07=x-2*gllz_10[ 1],d08=x-2*gllz_10[ 0], - d09=x-2 ; - const double u0_01= 1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02, - u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05, - u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08; - const double v0_08=d09* 1,v0_07=d08*v0_08,v0_06=d07*v0_07, - v0_05=d06*v0_06,v0_04=d05*v0_05,v0_03=d04*v0_04, - v0_02=d03*v0_03,v0_01=d02*v0_02,v0_00=d01*v0_01; - p[ 0]=w[ 0]* 1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02; - p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05; - p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08; - p[ 9]=w[ 9]*u0_09* 1; - if(d>0) { - const double u1_02= 1*d01+u0_01,u1_03=u1_02*d02+u0_02, - u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04, - u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06, - u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08; - const double v1_07=d08* 1+v0_08,v1_06=d07*v1_07+v0_07, - v1_05=d06*v1_06+v0_06,v1_04=d05*v1_05+v0_05, - v1_03=d04*v1_04+v0_04,v1_02=d03*v1_03+v0_03, - v1_01=d02*v1_02+v0_02,v1_00=d01*v1_01+v0_01; - p[10+ 0]=2*w[ 0]*( v1_00); - p[10+ 1]=2*w[ 1]*( 1*v0_01+u0_01*v1_01); - p[10+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02); - p[10+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03); - p[10+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04); - p[10+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05); - p[10+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06); - p[10+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07); - p[10+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08* 1); - p[10+ 9]=2*w[ 9]*(u1_09 ); - if(d>1) { - const double u2_03= 2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03, - u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05, - u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07, - u2_09=u2_08*d08+2*u1_08; - const double v2_06=d07* 2+2*v1_07,v2_05=d06*v2_06+2*v1_06, - v2_04=d05*v2_05+2*v1_05,v2_03=d04*v2_04+2*v1_04, - v2_02=d03*v2_03+2*v1_03,v2_01=d02*v2_02+2*v1_02, - v2_00=d01*v2_01+2*v1_01; - p[2*10+ 0]=4*w[ 0]*( + 1*v2_00); - 
p[2*10+ 1]=4*w[ 1]*( +2* 1*v1_01+u0_01*v2_01); - p[2*10+ 2]=4*w[ 2]*( 2*v0_02+2*u1_02*v1_02+u0_02*v2_02); - p[2*10+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03); - p[2*10+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04); - p[2*10+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05); - p[2*10+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06); - p[2*10+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07* 2); - p[2*10+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08* 1 ); - p[2*10+ 9]=4*w[ 9]*(u2_09* 1 ); - } - } -} - -static const double gllz_11[ 4] = { - 0.93400143040805913433227413609938363453991733010996, - 0.78448347366314441862241781610845810350719745509406, - 0.56523532699620500647096396947775166428305214556202, - 0.2957581355869393914319115155590575089410064343486 -}; - -static void gll_lag_11(double *restrict p, double *restrict w, - unsigned n, int d, double xh) -{ - const double x = xh*2; - const double d00=x+2 ,d01=x+2*gllz_11[ 0],d02=x+2*gllz_11[ 1], - d03=x+2*gllz_11[ 2],d04=x+2*gllz_11[ 3],d05=x , - d06=x-2*gllz_11[ 3],d07=x-2*gllz_11[ 2],d08=x-2*gllz_11[ 1], - d09=x-2*gllz_11[ 0],d10=x-2 ; - const double u0_01= 1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02, - u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05, - u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08, - u0_10=u0_09*d09; - const double v0_09=d10* 1,v0_08=d09*v0_09,v0_07=d08*v0_08, - v0_06=d07*v0_07,v0_05=d06*v0_06,v0_04=d05*v0_05, - v0_03=d04*v0_04,v0_02=d03*v0_03,v0_01=d02*v0_02, - v0_00=d01*v0_01; - p[ 0]=w[ 0]* 1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02; - p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05; - p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08; - p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10* 1; - if(d>0) { - const double u1_02= 1*d01+u0_01,u1_03=u1_02*d02+u0_02, - u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04, - u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06, - u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08, - u1_10=u1_09*d09+u0_09; - const double 
v1_08=d09* 1+v0_09,v1_07=d08*v1_08+v0_08, - v1_06=d07*v1_07+v0_07,v1_05=d06*v1_06+v0_06, - v1_04=d05*v1_05+v0_05,v1_03=d04*v1_04+v0_04, - v1_02=d03*v1_03+v0_03,v1_01=d02*v1_02+v0_02, - v1_00=d01*v1_01+v0_01; - p[11+ 0]=2*w[ 0]*( v1_00); - p[11+ 1]=2*w[ 1]*( 1*v0_01+u0_01*v1_01); - p[11+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02); - p[11+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03); - p[11+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04); - p[11+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05); - p[11+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06); - p[11+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07); - p[11+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08); - p[11+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09* 1); - p[11+10]=2*w[10]*(u1_10 ); - if(d>1) { - const double u2_03= 2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03, - u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05, - u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07, - u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09; - const double v2_07=d08* 2+2*v1_08,v2_06=d07*v2_07+2*v1_07, - v2_05=d06*v2_06+2*v1_06,v2_04=d05*v2_05+2*v1_05, - v2_03=d04*v2_04+2*v1_04,v2_02=d03*v2_03+2*v1_03, - v2_01=d02*v2_02+2*v1_02,v2_00=d01*v2_01+2*v1_01; - p[2*11+ 0]=4*w[ 0]*( + 1*v2_00); - p[2*11+ 1]=4*w[ 1]*( +2* 1*v1_01+u0_01*v2_01); - p[2*11+ 2]=4*w[ 2]*( 2*v0_02+2*u1_02*v1_02+u0_02*v2_02); - p[2*11+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03); - p[2*11+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04); - p[2*11+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05); - p[2*11+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06); - p[2*11+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07); - p[2*11+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08* 2); - p[2*11+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09* 1 ); - p[2*11+10]=4*w[10]*(u2_10* 1 ); - } - } -} - -static const double gllz_12[ 5] = { - 0.94489927222288222340758013830321871361125655195003, - 0.81927932164400667834864158171690266069046665790364, - 0.6328761530318606776624048544436558582438437454015, - 0.39953094096534893226434979156696690052774803279531, - 
0.13655293285492755486406185573969389689841411128206 -}; - -static void gll_lag_12(double *restrict p, double *restrict w, - unsigned n, int d, double xh) -{ - const double x = xh*2; - const double d00=x+2 ,d01=x+2*gllz_12[ 0],d02=x+2*gllz_12[ 1], - d03=x+2*gllz_12[ 2],d04=x+2*gllz_12[ 3],d05=x+2*gllz_12[ 4], - d06=x-2*gllz_12[ 4],d07=x-2*gllz_12[ 3],d08=x-2*gllz_12[ 2], - d09=x-2*gllz_12[ 1],d10=x-2*gllz_12[ 0],d11=x-2 ; - const double u0_01= 1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02, - u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05, - u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08, - u0_10=u0_09*d09,u0_11=u0_10*d10; - const double v0_10=d11* 1,v0_09=d10*v0_10,v0_08=d09*v0_09, - v0_07=d08*v0_08,v0_06=d07*v0_07,v0_05=d06*v0_06, - v0_04=d05*v0_05,v0_03=d04*v0_04,v0_02=d03*v0_03, - v0_01=d02*v0_02,v0_00=d01*v0_01; - p[ 0]=w[ 0]* 1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02; - p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05; - p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08; - p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11* 1; - if(d>0) { - const double u1_02= 1*d01+u0_01,u1_03=u1_02*d02+u0_02, - u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04, - u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06, - u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08, - u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10; - const double v1_09=d10* 1+v0_10,v1_08=d09*v1_09+v0_09, - v1_07=d08*v1_08+v0_08,v1_06=d07*v1_07+v0_07, - v1_05=d06*v1_06+v0_06,v1_04=d05*v1_05+v0_05, - v1_03=d04*v1_04+v0_04,v1_02=d03*v1_03+v0_03, - v1_01=d02*v1_02+v0_02,v1_00=d01*v1_01+v0_01; - p[12+ 0]=2*w[ 0]*( v1_00); - p[12+ 1]=2*w[ 1]*( 1*v0_01+u0_01*v1_01); - p[12+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02); - p[12+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03); - p[12+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04); - p[12+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05); - p[12+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06); - p[12+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07); - p[12+ 8]=2*w[ 
8]*(u1_08*v0_08+u0_08*v1_08); - p[12+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09); - p[12+10]=2*w[10]*(u1_10*v0_10+u0_10* 1); - p[12+11]=2*w[11]*(u1_11 ); - if(d>1) { - const double u2_03= 2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03, - u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05, - u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07, - u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09, - u2_11=u2_10*d10+2*u1_10; - const double v2_08=d09* 2+2*v1_09,v2_07=d08*v2_08+2*v1_08, - v2_06=d07*v2_07+2*v1_07,v2_05=d06*v2_06+2*v1_06, - v2_04=d05*v2_05+2*v1_05,v2_03=d04*v2_04+2*v1_04, - v2_02=d03*v2_03+2*v1_03,v2_01=d02*v2_02+2*v1_02, - v2_00=d01*v2_01+2*v1_01; - p[2*12+ 0]=4*w[ 0]*( + 1*v2_00); - p[2*12+ 1]=4*w[ 1]*( +2* 1*v1_01+u0_01*v2_01); - p[2*12+ 2]=4*w[ 2]*( 2*v0_02+2*u1_02*v1_02+u0_02*v2_02); - p[2*12+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03); - p[2*12+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04); - p[2*12+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05); - p[2*12+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06); - p[2*12+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07); - p[2*12+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08); - p[2*12+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09* 2); - p[2*12+10]=4*w[10]*(u2_10*v0_10+2*u1_10* 1 ); - p[2*12+11]=4*w[11]*(u2_11* 1 ); - } - } -} - -static const double gllz_13[ 5] = { - 0.95330984664216391189690546475544915162650788869736, - 0.84634756465187231686592560709875335957803665971441, - 0.68618846908175742607275903956635555292917619812438, - 0.48290982109133620174693723363693362077219326211859, - 0.24928693010623999256867370037422698148881131249298 -}; - -static void gll_lag_13(double *restrict p, double *restrict w, - unsigned n, int d, double xh) -{ - const double x = xh*2; - const double d00=x+2 ,d01=x+2*gllz_13[ 0],d02=x+2*gllz_13[ 1], - d03=x+2*gllz_13[ 2],d04=x+2*gllz_13[ 3],d05=x+2*gllz_13[ 4], - d06=x ,d07=x-2*gllz_13[ 4],d08=x-2*gllz_13[ 3], - d09=x-2*gllz_13[ 2],d10=x-2*gllz_13[ 1],d11=x-2*gllz_13[ 0], - 
d12=x-2 ; - const double u0_01= 1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02, - u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05, - u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08, - u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11; - const double v0_11=d12* 1,v0_10=d11*v0_11,v0_09=d10*v0_10, - v0_08=d09*v0_09,v0_07=d08*v0_08,v0_06=d07*v0_07, - v0_05=d06*v0_06,v0_04=d05*v0_05,v0_03=d04*v0_04, - v0_02=d03*v0_03,v0_01=d02*v0_02,v0_00=d01*v0_01; - p[ 0]=w[ 0]* 1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02; - p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05; - p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08; - p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11*v0_11; - p[12]=w[12]*u0_12* 1; - if(d>0) { - const double u1_02= 1*d01+u0_01,u1_03=u1_02*d02+u0_02, - u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04, - u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06, - u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08, - u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10, - u1_12=u1_11*d11+u0_11; - const double v1_10=d11* 1+v0_11,v1_09=d10*v1_10+v0_10, - v1_08=d09*v1_09+v0_09,v1_07=d08*v1_08+v0_08, - v1_06=d07*v1_07+v0_07,v1_05=d06*v1_06+v0_06, - v1_04=d05*v1_05+v0_05,v1_03=d04*v1_04+v0_04, - v1_02=d03*v1_03+v0_03,v1_01=d02*v1_02+v0_02, - v1_00=d01*v1_01+v0_01; - p[13+ 0]=2*w[ 0]*( v1_00); - p[13+ 1]=2*w[ 1]*( 1*v0_01+u0_01*v1_01); - p[13+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02); - p[13+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03); - p[13+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04); - p[13+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05); - p[13+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06); - p[13+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07); - p[13+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08); - p[13+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09); - p[13+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10); - p[13+11]=2*w[11]*(u1_11*v0_11+u0_11* 1); - p[13+12]=2*w[12]*(u1_12 ); - if(d>1) { - const double u2_03= 2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03, - 
u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05, - u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07, - u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09, - u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11; - const double v2_09=d10* 2+2*v1_10,v2_08=d09*v2_09+2*v1_09, - v2_07=d08*v2_08+2*v1_08,v2_06=d07*v2_07+2*v1_07, - v2_05=d06*v2_06+2*v1_06,v2_04=d05*v2_05+2*v1_05, - v2_03=d04*v2_04+2*v1_04,v2_02=d03*v2_03+2*v1_03, - v2_01=d02*v2_02+2*v1_02,v2_00=d01*v2_01+2*v1_01; - p[2*13+ 0]=4*w[ 0]*( + 1*v2_00); - p[2*13+ 1]=4*w[ 1]*( +2* 1*v1_01+u0_01*v2_01); - p[2*13+ 2]=4*w[ 2]*( 2*v0_02+2*u1_02*v1_02+u0_02*v2_02); - p[2*13+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03); - p[2*13+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04); - p[2*13+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05); - p[2*13+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06); - p[2*13+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07); - p[2*13+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08); - p[2*13+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09); - p[2*13+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10* 2); - p[2*13+11]=4*w[11]*(u2_11*v0_11+2*u1_11* 1 ); - p[2*13+12]=4*w[12]*(u2_12* 1 ); - } - } -} - -static const double gllz_14[ 6] = { - 0.95993504526726090135510016201542438906639151857265, - 0.86780105383034725100022020290826421324987235309444, - 0.72886859909132614058467240052088159565733953169432, - 0.55063940292864705531662270585908063446213831955391, - 0.34272401334271284504390340364167464483311353414031, - 0.11633186888370386765877670973616016794150904425628 -}; - -static void gll_lag_14(double *restrict p, double *restrict w, - unsigned n, int d, double xh) -{ - const double x = xh*2; - const double d00=x+2 ,d01=x+2*gllz_14[ 0],d02=x+2*gllz_14[ 1], - d03=x+2*gllz_14[ 2],d04=x+2*gllz_14[ 3],d05=x+2*gllz_14[ 4], - d06=x+2*gllz_14[ 5],d07=x-2*gllz_14[ 5],d08=x-2*gllz_14[ 4], - d09=x-2*gllz_14[ 3],d10=x-2*gllz_14[ 2],d11=x-2*gllz_14[ 1], - d12=x-2*gllz_14[ 0],d13=x-2 ; - const 
double u0_01= 1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02, - u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05, - u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08, - u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11, - u0_13=u0_12*d12; - const double v0_12=d13* 1,v0_11=d12*v0_12,v0_10=d11*v0_11, - v0_09=d10*v0_10,v0_08=d09*v0_09,v0_07=d08*v0_08, - v0_06=d07*v0_07,v0_05=d06*v0_06,v0_04=d05*v0_05, - v0_03=d04*v0_04,v0_02=d03*v0_03,v0_01=d02*v0_02, - v0_00=d01*v0_01; - p[ 0]=w[ 0]* 1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02; - p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05; - p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08; - p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11*v0_11; - p[12]=w[12]*u0_12*v0_12; p[13]=w[13]*u0_13* 1; - if(d>0) { - const double u1_02= 1*d01+u0_01,u1_03=u1_02*d02+u0_02, - u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04, - u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06, - u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08, - u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10, - u1_12=u1_11*d11+u0_11,u1_13=u1_12*d12+u0_12; - const double v1_11=d12* 1+v0_12,v1_10=d11*v1_11+v0_11, - v1_09=d10*v1_10+v0_10,v1_08=d09*v1_09+v0_09, - v1_07=d08*v1_08+v0_08,v1_06=d07*v1_07+v0_07, - v1_05=d06*v1_06+v0_06,v1_04=d05*v1_05+v0_05, - v1_03=d04*v1_04+v0_04,v1_02=d03*v1_03+v0_03, - v1_01=d02*v1_02+v0_02,v1_00=d01*v1_01+v0_01; - p[14+ 0]=2*w[ 0]*( v1_00); - p[14+ 1]=2*w[ 1]*( 1*v0_01+u0_01*v1_01); - p[14+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02); - p[14+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03); - p[14+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04); - p[14+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05); - p[14+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06); - p[14+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07); - p[14+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08); - p[14+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09); - p[14+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10); - p[14+11]=2*w[11]*(u1_11*v0_11+u0_11*v1_11); - p[14+12]=2*w[12]*(u1_12*v0_12+u0_12* 1); - 
p[14+13]=2*w[13]*(u1_13 ); - if(d>1) { - const double u2_03= 2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03, - u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05, - u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07, - u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09, - u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11, - u2_13=u2_12*d12+2*u1_12; - const double v2_10=d11* 2+2*v1_11,v2_09=d10*v2_10+2*v1_10, - v2_08=d09*v2_09+2*v1_09,v2_07=d08*v2_08+2*v1_08, - v2_06=d07*v2_07+2*v1_07,v2_05=d06*v2_06+2*v1_06, - v2_04=d05*v2_05+2*v1_05,v2_03=d04*v2_04+2*v1_04, - v2_02=d03*v2_03+2*v1_03,v2_01=d02*v2_02+2*v1_02, - v2_00=d01*v2_01+2*v1_01; - p[2*14+ 0]=4*w[ 0]*( + 1*v2_00); - p[2*14+ 1]=4*w[ 1]*( +2* 1*v1_01+u0_01*v2_01); - p[2*14+ 2]=4*w[ 2]*( 2*v0_02+2*u1_02*v1_02+u0_02*v2_02); - p[2*14+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03); - p[2*14+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04); - p[2*14+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05); - p[2*14+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06); - p[2*14+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07); - p[2*14+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08); - p[2*14+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09); - p[2*14+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10*v2_10); - p[2*14+11]=4*w[11]*(u2_11*v0_11+2*u1_11*v1_11+u0_11* 2); - p[2*14+12]=4*w[12]*(u2_12*v0_12+2*u1_12* 1 ); - p[2*14+13]=4*w[13]*(u2_13* 1 ); - } - } -} - -static const double gllz_15[ 6] = { - 0.96524592650383857279585139206960117770765013599709, - 0.88508204422297629882540163148222965198871408520748, - 0.76351968995181520070411847597629161817736852031529, - 0.60625320546984571112352993863673350717973103375992, - 0.42063805471367248092189693873858041298433820549243, - 0.21535395536379423822567944627291771265215790120304 -}; - -static void gll_lag_15(double *restrict p, double *restrict w, - unsigned n, int d, double xh) -{ - const double x = xh*2; - const double d00=x+2 ,d01=x+2*gllz_15[ 0],d02=x+2*gllz_15[ 1], - 
d03=x+2*gllz_15[ 2],d04=x+2*gllz_15[ 3],d05=x+2*gllz_15[ 4], - d06=x+2*gllz_15[ 5],d07=x ,d08=x-2*gllz_15[ 5], - d09=x-2*gllz_15[ 4],d10=x-2*gllz_15[ 3],d11=x-2*gllz_15[ 2], - d12=x-2*gllz_15[ 1],d13=x-2*gllz_15[ 0],d14=x-2 ; - const double u0_01= 1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02, - u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05, - u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08, - u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11, - u0_13=u0_12*d12,u0_14=u0_13*d13; - const double v0_13=d14* 1,v0_12=d13*v0_13,v0_11=d12*v0_12, - v0_10=d11*v0_11,v0_09=d10*v0_10,v0_08=d09*v0_09, - v0_07=d08*v0_08,v0_06=d07*v0_07,v0_05=d06*v0_06, - v0_04=d05*v0_05,v0_03=d04*v0_04,v0_02=d03*v0_03, - v0_01=d02*v0_02,v0_00=d01*v0_01; - p[ 0]=w[ 0]* 1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02; - p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05; - p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08; - p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11*v0_11; - p[12]=w[12]*u0_12*v0_12; p[13]=w[13]*u0_13*v0_13; p[14]=w[14]*u0_14* 1; - if(d>0) { - const double u1_02= 1*d01+u0_01,u1_03=u1_02*d02+u0_02, - u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04, - u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06, - u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08, - u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10, - u1_12=u1_11*d11+u0_11,u1_13=u1_12*d12+u0_12, - u1_14=u1_13*d13+u0_13; - const double v1_12=d13* 1+v0_13,v1_11=d12*v1_12+v0_12, - v1_10=d11*v1_11+v0_11,v1_09=d10*v1_10+v0_10, - v1_08=d09*v1_09+v0_09,v1_07=d08*v1_08+v0_08, - v1_06=d07*v1_07+v0_07,v1_05=d06*v1_06+v0_06, - v1_04=d05*v1_05+v0_05,v1_03=d04*v1_04+v0_04, - v1_02=d03*v1_03+v0_03,v1_01=d02*v1_02+v0_02, - v1_00=d01*v1_01+v0_01; - p[15+ 0]=2*w[ 0]*( v1_00); - p[15+ 1]=2*w[ 1]*( 1*v0_01+u0_01*v1_01); - p[15+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02); - p[15+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03); - p[15+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04); - p[15+ 5]=2*w[ 
5]*(u1_05*v0_05+u0_05*v1_05); - p[15+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06); - p[15+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07); - p[15+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08); - p[15+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09); - p[15+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10); - p[15+11]=2*w[11]*(u1_11*v0_11+u0_11*v1_11); - p[15+12]=2*w[12]*(u1_12*v0_12+u0_12*v1_12); - p[15+13]=2*w[13]*(u1_13*v0_13+u0_13* 1); - p[15+14]=2*w[14]*(u1_14 ); - if(d>1) { - const double u2_03= 2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03, - u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05, - u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07, - u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09, - u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11, - u2_13=u2_12*d12+2*u1_12,u2_14=u2_13*d13+2*u1_13; - const double v2_11=d12* 2+2*v1_12,v2_10=d11*v2_11+2*v1_11, - v2_09=d10*v2_10+2*v1_10,v2_08=d09*v2_09+2*v1_09, - v2_07=d08*v2_08+2*v1_08,v2_06=d07*v2_07+2*v1_07, - v2_05=d06*v2_06+2*v1_06,v2_04=d05*v2_05+2*v1_05, - v2_03=d04*v2_04+2*v1_04,v2_02=d03*v2_03+2*v1_03, - v2_01=d02*v2_02+2*v1_02,v2_00=d01*v2_01+2*v1_01; - p[2*15+ 0]=4*w[ 0]*( + 1*v2_00); - p[2*15+ 1]=4*w[ 1]*( +2* 1*v1_01+u0_01*v2_01); - p[2*15+ 2]=4*w[ 2]*( 2*v0_02+2*u1_02*v1_02+u0_02*v2_02); - p[2*15+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03); - p[2*15+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04); - p[2*15+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05); - p[2*15+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06); - p[2*15+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07); - p[2*15+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08); - p[2*15+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09); - p[2*15+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10*v2_10); - p[2*15+11]=4*w[11]*(u2_11*v0_11+2*u1_11*v1_11+u0_11*v2_11); - p[2*15+12]=4*w[12]*(u2_12*v0_12+2*u1_12*v1_12+u0_12* 2); - p[2*15+13]=4*w[13]*(u2_13*v0_13+2*u1_13* 1 ); - p[2*15+14]=4*w[14]*(u2_14* 1 ); - } - } -} - -static const double gllz_16[ 7] = { - 
0.96956804627021793295224273836745924138899074650383, - 0.89920053309347209299462826151984947674999760904514, - 0.7920082918618150639310882709631457058080738279802, - 0.65238870288249308946788321964058148032155801282957, - 0.48605942188713761178189078584687469688897730429825, - 0.29983046890076320809835345472230064781546097690778, - 0.10132627352194944784303300504591776253324091440019 -}; - -static void gll_lag_16(double *restrict p, double *restrict w, - unsigned n, int d, double xh) -{ - const double x = xh*2; - const double d00=x+2 ,d01=x+2*gllz_16[ 0],d02=x+2*gllz_16[ 1], - d03=x+2*gllz_16[ 2],d04=x+2*gllz_16[ 3],d05=x+2*gllz_16[ 4], - d06=x+2*gllz_16[ 5],d07=x+2*gllz_16[ 6],d08=x-2*gllz_16[ 6], - d09=x-2*gllz_16[ 5],d10=x-2*gllz_16[ 4],d11=x-2*gllz_16[ 3], - d12=x-2*gllz_16[ 2],d13=x-2*gllz_16[ 1],d14=x-2*gllz_16[ 0], - d15=x-2 ; - const double u0_01= 1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02, - u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05, - u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08, - u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11, - u0_13=u0_12*d12,u0_14=u0_13*d13,u0_15=u0_14*d14; - const double v0_14=d15* 1,v0_13=d14*v0_14,v0_12=d13*v0_13, - v0_11=d12*v0_12,v0_10=d11*v0_11,v0_09=d10*v0_10, - v0_08=d09*v0_09,v0_07=d08*v0_08,v0_06=d07*v0_07, - v0_05=d06*v0_06,v0_04=d05*v0_05,v0_03=d04*v0_04, - v0_02=d03*v0_03,v0_01=d02*v0_02,v0_00=d01*v0_01; - p[ 0]=w[ 0]* 1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02; - p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05; - p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08; - p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11*v0_11; - p[12]=w[12]*u0_12*v0_12; p[13]=w[13]*u0_13*v0_13; p[14]=w[14]*u0_14*v0_14; - p[15]=w[15]*u0_15* 1; - if(d>0) { - const double u1_02= 1*d01+u0_01,u1_03=u1_02*d02+u0_02, - u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04, - u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06, - u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08, - 
u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10, - u1_12=u1_11*d11+u0_11,u1_13=u1_12*d12+u0_12, - u1_14=u1_13*d13+u0_13,u1_15=u1_14*d14+u0_14; - const double v1_13=d14* 1+v0_14,v1_12=d13*v1_13+v0_13, - v1_11=d12*v1_12+v0_12,v1_10=d11*v1_11+v0_11, - v1_09=d10*v1_10+v0_10,v1_08=d09*v1_09+v0_09, - v1_07=d08*v1_08+v0_08,v1_06=d07*v1_07+v0_07, - v1_05=d06*v1_06+v0_06,v1_04=d05*v1_05+v0_05, - v1_03=d04*v1_04+v0_04,v1_02=d03*v1_03+v0_03, - v1_01=d02*v1_02+v0_02,v1_00=d01*v1_01+v0_01; - p[16+ 0]=2*w[ 0]*( v1_00); - p[16+ 1]=2*w[ 1]*( 1*v0_01+u0_01*v1_01); - p[16+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02); - p[16+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03); - p[16+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04); - p[16+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05); - p[16+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06); - p[16+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07); - p[16+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08); - p[16+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09); - p[16+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10); - p[16+11]=2*w[11]*(u1_11*v0_11+u0_11*v1_11); - p[16+12]=2*w[12]*(u1_12*v0_12+u0_12*v1_12); - p[16+13]=2*w[13]*(u1_13*v0_13+u0_13*v1_13); - p[16+14]=2*w[14]*(u1_14*v0_14+u0_14* 1); - p[16+15]=2*w[15]*(u1_15 ); - if(d>1) { - const double u2_03= 2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03, - u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05, - u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07, - u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09, - u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11, - u2_13=u2_12*d12+2*u1_12,u2_14=u2_13*d13+2*u1_13, - u2_15=u2_14*d14+2*u1_14; - const double v2_12=d13* 2+2*v1_13,v2_11=d12*v2_12+2*v1_12, - v2_10=d11*v2_11+2*v1_11,v2_09=d10*v2_10+2*v1_10, - v2_08=d09*v2_09+2*v1_09,v2_07=d08*v2_08+2*v1_08, - v2_06=d07*v2_07+2*v1_07,v2_05=d06*v2_06+2*v1_06, - v2_04=d05*v2_05+2*v1_05,v2_03=d04*v2_04+2*v1_04, - v2_02=d03*v2_03+2*v1_03,v2_01=d02*v2_02+2*v1_02, - v2_00=d01*v2_01+2*v1_01; - p[2*16+ 0]=4*w[ 0]*( + 1*v2_00); - p[2*16+ 1]=4*w[ 1]*( +2* 1*v1_01+u0_01*v2_01); - p[2*16+ 2]=4*w[ 2]*( 
2*v0_02+2*u1_02*v1_02+u0_02*v2_02); - p[2*16+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03); - p[2*16+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04); - p[2*16+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05); - p[2*16+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06); - p[2*16+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07); - p[2*16+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08); - p[2*16+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09); - p[2*16+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10*v2_10); - p[2*16+11]=4*w[11]*(u2_11*v0_11+2*u1_11*v1_11+u0_11*v2_11); - p[2*16+12]=4*w[12]*(u2_12*v0_12+2*u1_12*v1_12+u0_12*v2_12); - p[2*16+13]=4*w[13]*(u2_13*v0_13+2*u1_13*v1_13+u0_13* 2); - p[2*16+14]=4*w[14]*(u2_14*v0_14+2*u1_14* 1 ); - p[2*16+15]=4*w[15]*(u2_15* 1 ); - } - } -} - -static const double gllz_17[ 7] = { - 0.97313217663141831415697950187372143058895914912251, - 0.91087999591557359562380250639772646753087945186873, - 0.81569625122177030710675055323752665471640239706712, - 0.69102898062768470539491935737245329680641306219042, - 0.54138539933010153912373340750406325167514664796483, - 0.37217443356547704190723468073525781255981731440028, - 0.1895119735183173883042630147531139713449924229225 -}; - -static void gll_lag_17(double *restrict p, double *restrict w, - unsigned n, int d, double xh) -{ - const double x = xh*2; - const double d00=x+2 ,d01=x+2*gllz_17[ 0],d02=x+2*gllz_17[ 1], - d03=x+2*gllz_17[ 2],d04=x+2*gllz_17[ 3],d05=x+2*gllz_17[ 4], - d06=x+2*gllz_17[ 5],d07=x+2*gllz_17[ 6],d08=x , - d09=x-2*gllz_17[ 6],d10=x-2*gllz_17[ 5],d11=x-2*gllz_17[ 4], - d12=x-2*gllz_17[ 3],d13=x-2*gllz_17[ 2],d14=x-2*gllz_17[ 1], - d15=x-2*gllz_17[ 0],d16=x-2 ; - const double u0_01= 1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02, - u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05, - u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08, - u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11, - u0_13=u0_12*d12,u0_14=u0_13*d13,u0_15=u0_14*d14, - u0_16=u0_15*d15; - const 
double v0_15=d16* 1,v0_14=d15*v0_15,v0_13=d14*v0_14, - v0_12=d13*v0_13,v0_11=d12*v0_12,v0_10=d11*v0_11, - v0_09=d10*v0_10,v0_08=d09*v0_09,v0_07=d08*v0_08, - v0_06=d07*v0_07,v0_05=d06*v0_06,v0_04=d05*v0_05, - v0_03=d04*v0_04,v0_02=d03*v0_03,v0_01=d02*v0_02, - v0_00=d01*v0_01; - p[ 0]=w[ 0]* 1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02; - p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05; - p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08; - p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11*v0_11; - p[12]=w[12]*u0_12*v0_12; p[13]=w[13]*u0_13*v0_13; p[14]=w[14]*u0_14*v0_14; - p[15]=w[15]*u0_15*v0_15; p[16]=w[16]*u0_16* 1; - if(d>0) { - const double u1_02= 1*d01+u0_01,u1_03=u1_02*d02+u0_02, - u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04, - u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06, - u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08, - u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10, - u1_12=u1_11*d11+u0_11,u1_13=u1_12*d12+u0_12, - u1_14=u1_13*d13+u0_13,u1_15=u1_14*d14+u0_14, - u1_16=u1_15*d15+u0_15; - const double v1_14=d15* 1+v0_15,v1_13=d14*v1_14+v0_14, - v1_12=d13*v1_13+v0_13,v1_11=d12*v1_12+v0_12, - v1_10=d11*v1_11+v0_11,v1_09=d10*v1_10+v0_10, - v1_08=d09*v1_09+v0_09,v1_07=d08*v1_08+v0_08, - v1_06=d07*v1_07+v0_07,v1_05=d06*v1_06+v0_06, - v1_04=d05*v1_05+v0_05,v1_03=d04*v1_04+v0_04, - v1_02=d03*v1_03+v0_03,v1_01=d02*v1_02+v0_02, - v1_00=d01*v1_01+v0_01; - p[17+ 0]=2*w[ 0]*( v1_00); - p[17+ 1]=2*w[ 1]*( 1*v0_01+u0_01*v1_01); - p[17+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02); - p[17+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03); - p[17+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04); - p[17+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05); - p[17+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06); - p[17+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07); - p[17+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08); - p[17+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09); - p[17+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10); - p[17+11]=2*w[11]*(u1_11*v0_11+u0_11*v1_11); - 
p[17+12]=2*w[12]*(u1_12*v0_12+u0_12*v1_12); - p[17+13]=2*w[13]*(u1_13*v0_13+u0_13*v1_13); - p[17+14]=2*w[14]*(u1_14*v0_14+u0_14*v1_14); - p[17+15]=2*w[15]*(u1_15*v0_15+u0_15* 1); - p[17+16]=2*w[16]*(u1_16 ); - if(d>1) { - const double u2_03= 2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03, - u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05, - u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07, - u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09, - u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11, - u2_13=u2_12*d12+2*u1_12,u2_14=u2_13*d13+2*u1_13, - u2_15=u2_14*d14+2*u1_14,u2_16=u2_15*d15+2*u1_15; - const double v2_13=d14* 2+2*v1_14,v2_12=d13*v2_13+2*v1_13, - v2_11=d12*v2_12+2*v1_12,v2_10=d11*v2_11+2*v1_11, - v2_09=d10*v2_10+2*v1_10,v2_08=d09*v2_09+2*v1_09, - v2_07=d08*v2_08+2*v1_08,v2_06=d07*v2_07+2*v1_07, - v2_05=d06*v2_06+2*v1_06,v2_04=d05*v2_05+2*v1_05, - v2_03=d04*v2_04+2*v1_04,v2_02=d03*v2_03+2*v1_03, - v2_01=d02*v2_02+2*v1_02,v2_00=d01*v2_01+2*v1_01; - p[2*17+ 0]=4*w[ 0]*( + 1*v2_00); - p[2*17+ 1]=4*w[ 1]*( +2* 1*v1_01+u0_01*v2_01); - p[2*17+ 2]=4*w[ 2]*( 2*v0_02+2*u1_02*v1_02+u0_02*v2_02); - p[2*17+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03); - p[2*17+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04); - p[2*17+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05); - p[2*17+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06); - p[2*17+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07); - p[2*17+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08); - p[2*17+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09); - p[2*17+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10*v2_10); - p[2*17+11]=4*w[11]*(u2_11*v0_11+2*u1_11*v1_11+u0_11*v2_11); - p[2*17+12]=4*w[12]*(u2_12*v0_12+2*u1_12*v1_12+u0_12*v2_12); - p[2*17+13]=4*w[13]*(u2_13*v0_13+2*u1_13*v1_13+u0_13*v2_13); - p[2*17+14]=4*w[14]*(u2_14*v0_14+2*u1_14*v1_14+u0_14* 2); - p[2*17+15]=4*w[15]*(u2_15*v0_15+2*u1_15* 1 ); - p[2*17+16]=4*w[16]*(u2_16* 1 ); - } - } -} - -static const double gllz_18[ 8] = { - 
0.97610555741219854286451892434170006676181344271919, - 0.92064918534753387383785462543127742356235348618904, - 0.83559353521809021371364636232793725743367075916582, - 0.72367932928324268130621036530207067914952520415476, - 0.58850483431866176117353589319355946900083678931622, - 0.43441503691212397534228713674067479584975844516369, - 0.26636265287828098416766533202559594206513618931826, - 0.089749093484652111022645010088561734960603901041125 -}; - -static void gll_lag_18(double *restrict p, double *restrict w, - unsigned n, int d, double xh) -{ - const double x = xh*2; - const double d00=x+2 ,d01=x+2*gllz_18[ 0],d02=x+2*gllz_18[ 1], - d03=x+2*gllz_18[ 2],d04=x+2*gllz_18[ 3],d05=x+2*gllz_18[ 4], - d06=x+2*gllz_18[ 5],d07=x+2*gllz_18[ 6],d08=x+2*gllz_18[ 7], - d09=x-2*gllz_18[ 7],d10=x-2*gllz_18[ 6],d11=x-2*gllz_18[ 5], - d12=x-2*gllz_18[ 4],d13=x-2*gllz_18[ 3],d14=x-2*gllz_18[ 2], - d15=x-2*gllz_18[ 1],d16=x-2*gllz_18[ 0],d17=x-2 ; - const double u0_01= 1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02, - u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05, - u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08, - u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11, - u0_13=u0_12*d12,u0_14=u0_13*d13,u0_15=u0_14*d14, - u0_16=u0_15*d15,u0_17=u0_16*d16; - const double v0_16=d17* 1,v0_15=d16*v0_16,v0_14=d15*v0_15, - v0_13=d14*v0_14,v0_12=d13*v0_13,v0_11=d12*v0_12, - v0_10=d11*v0_11,v0_09=d10*v0_10,v0_08=d09*v0_09, - v0_07=d08*v0_08,v0_06=d07*v0_07,v0_05=d06*v0_06, - v0_04=d05*v0_05,v0_03=d04*v0_04,v0_02=d03*v0_03, - v0_01=d02*v0_02,v0_00=d01*v0_01; - p[ 0]=w[ 0]* 1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02; - p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05; - p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08; - p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11*v0_11; - p[12]=w[12]*u0_12*v0_12; p[13]=w[13]*u0_13*v0_13; p[14]=w[14]*u0_14*v0_14; - p[15]=w[15]*u0_15*v0_15; p[16]=w[16]*u0_16*v0_16; p[17]=w[17]*u0_17* 
1; - if(d>0) { - const double u1_02= 1*d01+u0_01,u1_03=u1_02*d02+u0_02, - u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04, - u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06, - u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08, - u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10, - u1_12=u1_11*d11+u0_11,u1_13=u1_12*d12+u0_12, - u1_14=u1_13*d13+u0_13,u1_15=u1_14*d14+u0_14, - u1_16=u1_15*d15+u0_15,u1_17=u1_16*d16+u0_16; - const double v1_15=d16* 1+v0_16,v1_14=d15*v1_15+v0_15, - v1_13=d14*v1_14+v0_14,v1_12=d13*v1_13+v0_13, - v1_11=d12*v1_12+v0_12,v1_10=d11*v1_11+v0_11, - v1_09=d10*v1_10+v0_10,v1_08=d09*v1_09+v0_09, - v1_07=d08*v1_08+v0_08,v1_06=d07*v1_07+v0_07, - v1_05=d06*v1_06+v0_06,v1_04=d05*v1_05+v0_05, - v1_03=d04*v1_04+v0_04,v1_02=d03*v1_03+v0_03, - v1_01=d02*v1_02+v0_02,v1_00=d01*v1_01+v0_01; - p[18+ 0]=2*w[ 0]*( v1_00); - p[18+ 1]=2*w[ 1]*( 1*v0_01+u0_01*v1_01); - p[18+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02); - p[18+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03); - p[18+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04); - p[18+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05); - p[18+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06); - p[18+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07); - p[18+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08); - p[18+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09); - p[18+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10); - p[18+11]=2*w[11]*(u1_11*v0_11+u0_11*v1_11); - p[18+12]=2*w[12]*(u1_12*v0_12+u0_12*v1_12); - p[18+13]=2*w[13]*(u1_13*v0_13+u0_13*v1_13); - p[18+14]=2*w[14]*(u1_14*v0_14+u0_14*v1_14); - p[18+15]=2*w[15]*(u1_15*v0_15+u0_15*v1_15); - p[18+16]=2*w[16]*(u1_16*v0_16+u0_16* 1); - p[18+17]=2*w[17]*(u1_17 ); - if(d>1) { - const double u2_03= 2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03, - u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05, - u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07, - u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09, - u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11, - u2_13=u2_12*d12+2*u1_12,u2_14=u2_13*d13+2*u1_13, - u2_15=u2_14*d14+2*u1_14,u2_16=u2_15*d15+2*u1_15, - u2_17=u2_16*d16+2*u1_16; - const double 
v2_14=d15* 2+2*v1_15,v2_13=d14*v2_14+2*v1_14, - v2_12=d13*v2_13+2*v1_13,v2_11=d12*v2_12+2*v1_12, - v2_10=d11*v2_11+2*v1_11,v2_09=d10*v2_10+2*v1_10, - v2_08=d09*v2_09+2*v1_09,v2_07=d08*v2_08+2*v1_08, - v2_06=d07*v2_07+2*v1_07,v2_05=d06*v2_06+2*v1_06, - v2_04=d05*v2_05+2*v1_05,v2_03=d04*v2_04+2*v1_04, - v2_02=d03*v2_03+2*v1_03,v2_01=d02*v2_02+2*v1_02, - v2_00=d01*v2_01+2*v1_01; - p[2*18+ 0]=4*w[ 0]*( + 1*v2_00); - p[2*18+ 1]=4*w[ 1]*( +2* 1*v1_01+u0_01*v2_01); - p[2*18+ 2]=4*w[ 2]*( 2*v0_02+2*u1_02*v1_02+u0_02*v2_02); - p[2*18+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03); - p[2*18+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04); - p[2*18+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05); - p[2*18+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06); - p[2*18+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07); - p[2*18+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08); - p[2*18+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09); - p[2*18+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10*v2_10); - p[2*18+11]=4*w[11]*(u2_11*v0_11+2*u1_11*v1_11+u0_11*v2_11); - p[2*18+12]=4*w[12]*(u2_12*v0_12+2*u1_12*v1_12+u0_12*v2_12); - p[2*18+13]=4*w[13]*(u2_13*v0_13+2*u1_13*v1_13+u0_13*v2_13); - p[2*18+14]=4*w[14]*(u2_14*v0_14+2*u1_14*v1_14+u0_14*v2_14); - p[2*18+15]=4*w[15]*(u2_15*v0_15+2*u1_15*v1_15+u0_15* 2); - p[2*18+16]=4*w[16]*(u2_16*v0_16+2*u1_16* 1 ); - p[2*18+17]=4*w[17]*(u2_17* 1 ); - } - } -} - -static const double gllz_19[ 8] = { - 0.97861176622208009515263406311022256281427733781081, - 0.92890152815258624371794025879654861245016818225195, - 0.85246057779664609308595597004106262523709538083887, - 0.7514942025526130141636374896339440404036593556658, - 0.62890813726522049776683230622873254706861115718956, - 0.48822928568071350277790963762492336977121559965148, - 0.33350484782449861029850010384492701192296337547773, - 0.16918602340928157137515415344488042375289555076585 -}; - -static void gll_lag_19(double *restrict p, double *restrict w, - unsigned n, int d, 
double xh) -{ - const double x = xh*2; - const double d00=x+2 ,d01=x+2*gllz_19[ 0],d02=x+2*gllz_19[ 1], - d03=x+2*gllz_19[ 2],d04=x+2*gllz_19[ 3],d05=x+2*gllz_19[ 4], - d06=x+2*gllz_19[ 5],d07=x+2*gllz_19[ 6],d08=x+2*gllz_19[ 7], - d09=x ,d10=x-2*gllz_19[ 7],d11=x-2*gllz_19[ 6], - d12=x-2*gllz_19[ 5],d13=x-2*gllz_19[ 4],d14=x-2*gllz_19[ 3], - d15=x-2*gllz_19[ 2],d16=x-2*gllz_19[ 1],d17=x-2*gllz_19[ 0], - d18=x-2 ; - const double u0_01= 1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02, - u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05, - u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08, - u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11, - u0_13=u0_12*d12,u0_14=u0_13*d13,u0_15=u0_14*d14, - u0_16=u0_15*d15,u0_17=u0_16*d16,u0_18=u0_17*d17; - const double v0_17=d18* 1,v0_16=d17*v0_17,v0_15=d16*v0_16, - v0_14=d15*v0_15,v0_13=d14*v0_14,v0_12=d13*v0_13, - v0_11=d12*v0_12,v0_10=d11*v0_11,v0_09=d10*v0_10, - v0_08=d09*v0_09,v0_07=d08*v0_08,v0_06=d07*v0_07, - v0_05=d06*v0_06,v0_04=d05*v0_05,v0_03=d04*v0_04, - v0_02=d03*v0_03,v0_01=d02*v0_02,v0_00=d01*v0_01; - p[ 0]=w[ 0]* 1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02; - p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05; - p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08; - p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11*v0_11; - p[12]=w[12]*u0_12*v0_12; p[13]=w[13]*u0_13*v0_13; p[14]=w[14]*u0_14*v0_14; - p[15]=w[15]*u0_15*v0_15; p[16]=w[16]*u0_16*v0_16; p[17]=w[17]*u0_17*v0_17; - p[18]=w[18]*u0_18* 1; - if(d>0) { - const double u1_02= 1*d01+u0_01,u1_03=u1_02*d02+u0_02, - u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04, - u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06, - u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08, - u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10, - u1_12=u1_11*d11+u0_11,u1_13=u1_12*d12+u0_12, - u1_14=u1_13*d13+u0_13,u1_15=u1_14*d14+u0_14, - u1_16=u1_15*d15+u0_15,u1_17=u1_16*d16+u0_16, - u1_18=u1_17*d17+u0_17; - const double v1_16=d17* 
1+v0_17,v1_15=d16*v1_16+v0_16, - v1_14=d15*v1_15+v0_15,v1_13=d14*v1_14+v0_14, - v1_12=d13*v1_13+v0_13,v1_11=d12*v1_12+v0_12, - v1_10=d11*v1_11+v0_11,v1_09=d10*v1_10+v0_10, - v1_08=d09*v1_09+v0_09,v1_07=d08*v1_08+v0_08, - v1_06=d07*v1_07+v0_07,v1_05=d06*v1_06+v0_06, - v1_04=d05*v1_05+v0_05,v1_03=d04*v1_04+v0_04, - v1_02=d03*v1_03+v0_03,v1_01=d02*v1_02+v0_02, - v1_00=d01*v1_01+v0_01; - p[19+ 0]=2*w[ 0]*( v1_00); - p[19+ 1]=2*w[ 1]*( 1*v0_01+u0_01*v1_01); - p[19+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02); - p[19+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03); - p[19+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04); - p[19+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05); - p[19+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06); - p[19+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07); - p[19+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08); - p[19+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09); - p[19+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10); - p[19+11]=2*w[11]*(u1_11*v0_11+u0_11*v1_11); - p[19+12]=2*w[12]*(u1_12*v0_12+u0_12*v1_12); - p[19+13]=2*w[13]*(u1_13*v0_13+u0_13*v1_13); - p[19+14]=2*w[14]*(u1_14*v0_14+u0_14*v1_14); - p[19+15]=2*w[15]*(u1_15*v0_15+u0_15*v1_15); - p[19+16]=2*w[16]*(u1_16*v0_16+u0_16*v1_16); - p[19+17]=2*w[17]*(u1_17*v0_17+u0_17* 1); - p[19+18]=2*w[18]*(u1_18 ); - if(d>1) { - const double u2_03= 2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03, - u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05, - u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07, - u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09, - u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11, - u2_13=u2_12*d12+2*u1_12,u2_14=u2_13*d13+2*u1_13, - u2_15=u2_14*d14+2*u1_14,u2_16=u2_15*d15+2*u1_15, - u2_17=u2_16*d16+2*u1_16,u2_18=u2_17*d17+2*u1_17; - const double v2_15=d16* 2+2*v1_16,v2_14=d15*v2_15+2*v1_15, - v2_13=d14*v2_14+2*v1_14,v2_12=d13*v2_13+2*v1_13, - v2_11=d12*v2_12+2*v1_12,v2_10=d11*v2_11+2*v1_11, - v2_09=d10*v2_10+2*v1_10,v2_08=d09*v2_09+2*v1_09, - v2_07=d08*v2_08+2*v1_08,v2_06=d07*v2_07+2*v1_07, - v2_05=d06*v2_06+2*v1_06,v2_04=d05*v2_05+2*v1_05, - 
v2_03=d04*v2_04+2*v1_04,v2_02=d03*v2_03+2*v1_03, - v2_01=d02*v2_02+2*v1_02,v2_00=d01*v2_01+2*v1_01; - p[2*19+ 0]=4*w[ 0]*( + 1*v2_00); - p[2*19+ 1]=4*w[ 1]*( +2* 1*v1_01+u0_01*v2_01); - p[2*19+ 2]=4*w[ 2]*( 2*v0_02+2*u1_02*v1_02+u0_02*v2_02); - p[2*19+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03); - p[2*19+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04); - p[2*19+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05); - p[2*19+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06); - p[2*19+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07); - p[2*19+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08); - p[2*19+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09); - p[2*19+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10*v2_10); - p[2*19+11]=4*w[11]*(u2_11*v0_11+2*u1_11*v1_11+u0_11*v2_11); - p[2*19+12]=4*w[12]*(u2_12*v0_12+2*u1_12*v1_12+u0_12*v2_12); - p[2*19+13]=4*w[13]*(u2_13*v0_13+2*u1_13*v1_13+u0_13*v2_13); - p[2*19+14]=4*w[14]*(u2_14*v0_14+2*u1_14*v1_14+u0_14*v2_14); - p[2*19+15]=4*w[15]*(u2_15*v0_15+2*u1_15*v1_15+u0_15*v2_15); - p[2*19+16]=4*w[16]*(u2_16*v0_16+2*u1_16*v1_16+u0_16* 2); - p[2*19+17]=4*w[17]*(u2_17*v0_17+2*u1_17* 1 ); - p[2*19+18]=4*w[18]*(u2_18* 1 ); - } - } -} - -static const double gllz_20[ 9] = { - 0.98074370489391417192544643858423091522991062312625, - 0.93593449881266543571618158493062692991557383318105, - 0.86687797808995014130984721461628521396291128831699, - 0.77536826095205587041431752759469134337272185947653, - 0.66377640229031128984640332297115885247574574199149, - 0.53499286403188626164813596182898398300685156913752, - 0.39235318371390929938647470381582436666520332929891, - 0.23955170592298649518240135692708807194151780992738, - 0.080545937238821837975944518159554463022392870092908 -}; - -static void gll_lag_20(double *restrict p, double *restrict w, - unsigned n, int d, double xh) -{ - const double x = xh*2; - const double d00=x+2 ,d01=x+2*gllz_20[ 0],d02=x+2*gllz_20[ 1], - d03=x+2*gllz_20[ 2],d04=x+2*gllz_20[ 3],d05=x+2*gllz_20[ 
4], - d06=x+2*gllz_20[ 5],d07=x+2*gllz_20[ 6],d08=x+2*gllz_20[ 7], - d09=x+2*gllz_20[ 8],d10=x-2*gllz_20[ 8],d11=x-2*gllz_20[ 7], - d12=x-2*gllz_20[ 6],d13=x-2*gllz_20[ 5],d14=x-2*gllz_20[ 4], - d15=x-2*gllz_20[ 3],d16=x-2*gllz_20[ 2],d17=x-2*gllz_20[ 1], - d18=x-2*gllz_20[ 0],d19=x-2 ; - const double u0_01= 1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02, - u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05, - u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08, - u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11, - u0_13=u0_12*d12,u0_14=u0_13*d13,u0_15=u0_14*d14, - u0_16=u0_15*d15,u0_17=u0_16*d16,u0_18=u0_17*d17, - u0_19=u0_18*d18; - const double v0_18=d19* 1,v0_17=d18*v0_18,v0_16=d17*v0_17, - v0_15=d16*v0_16,v0_14=d15*v0_15,v0_13=d14*v0_14, - v0_12=d13*v0_13,v0_11=d12*v0_12,v0_10=d11*v0_11, - v0_09=d10*v0_10,v0_08=d09*v0_09,v0_07=d08*v0_08, - v0_06=d07*v0_07,v0_05=d06*v0_06,v0_04=d05*v0_05, - v0_03=d04*v0_04,v0_02=d03*v0_03,v0_01=d02*v0_02, - v0_00=d01*v0_01; - p[ 0]=w[ 0]* 1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02; - p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05; - p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08; - p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11*v0_11; - p[12]=w[12]*u0_12*v0_12; p[13]=w[13]*u0_13*v0_13; p[14]=w[14]*u0_14*v0_14; - p[15]=w[15]*u0_15*v0_15; p[16]=w[16]*u0_16*v0_16; p[17]=w[17]*u0_17*v0_17; - p[18]=w[18]*u0_18*v0_18; p[19]=w[19]*u0_19* 1; - if(d>0) { - const double u1_02= 1*d01+u0_01,u1_03=u1_02*d02+u0_02, - u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04, - u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06, - u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08, - u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10, - u1_12=u1_11*d11+u0_11,u1_13=u1_12*d12+u0_12, - u1_14=u1_13*d13+u0_13,u1_15=u1_14*d14+u0_14, - u1_16=u1_15*d15+u0_15,u1_17=u1_16*d16+u0_16, - u1_18=u1_17*d17+u0_17,u1_19=u1_18*d18+u0_18; - const double v1_17=d18* 1+v0_18,v1_16=d17*v1_17+v0_17, - 
v1_15=d16*v1_16+v0_16,v1_14=d15*v1_15+v0_15, - v1_13=d14*v1_14+v0_14,v1_12=d13*v1_13+v0_13, - v1_11=d12*v1_12+v0_12,v1_10=d11*v1_11+v0_11, - v1_09=d10*v1_10+v0_10,v1_08=d09*v1_09+v0_09, - v1_07=d08*v1_08+v0_08,v1_06=d07*v1_07+v0_07, - v1_05=d06*v1_06+v0_06,v1_04=d05*v1_05+v0_05, - v1_03=d04*v1_04+v0_04,v1_02=d03*v1_03+v0_03, - v1_01=d02*v1_02+v0_02,v1_00=d01*v1_01+v0_01; - p[20+ 0]=2*w[ 0]*( v1_00); - p[20+ 1]=2*w[ 1]*( 1*v0_01+u0_01*v1_01); - p[20+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02); - p[20+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03); - p[20+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04); - p[20+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05); - p[20+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06); - p[20+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07); - p[20+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08); - p[20+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09); - p[20+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10); - p[20+11]=2*w[11]*(u1_11*v0_11+u0_11*v1_11); - p[20+12]=2*w[12]*(u1_12*v0_12+u0_12*v1_12); - p[20+13]=2*w[13]*(u1_13*v0_13+u0_13*v1_13); - p[20+14]=2*w[14]*(u1_14*v0_14+u0_14*v1_14); - p[20+15]=2*w[15]*(u1_15*v0_15+u0_15*v1_15); - p[20+16]=2*w[16]*(u1_16*v0_16+u0_16*v1_16); - p[20+17]=2*w[17]*(u1_17*v0_17+u0_17*v1_17); - p[20+18]=2*w[18]*(u1_18*v0_18+u0_18* 1); - p[20+19]=2*w[19]*(u1_19 ); - if(d>1) { - const double u2_03= 2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03, - u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05, - u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07, - u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09, - u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11, - u2_13=u2_12*d12+2*u1_12,u2_14=u2_13*d13+2*u1_13, - u2_15=u2_14*d14+2*u1_14,u2_16=u2_15*d15+2*u1_15, - u2_17=u2_16*d16+2*u1_16,u2_18=u2_17*d17+2*u1_17, - u2_19=u2_18*d18+2*u1_18; - const double v2_16=d17* 2+2*v1_17,v2_15=d16*v2_16+2*v1_16, - v2_14=d15*v2_15+2*v1_15,v2_13=d14*v2_14+2*v1_14, - v2_12=d13*v2_13+2*v1_13,v2_11=d12*v2_12+2*v1_12, - v2_10=d11*v2_11+2*v1_11,v2_09=d10*v2_10+2*v1_10, - v2_08=d09*v2_09+2*v1_09,v2_07=d08*v2_08+2*v1_08, - 
v2_06=d07*v2_07+2*v1_07,v2_05=d06*v2_06+2*v1_06, - v2_04=d05*v2_05+2*v1_05,v2_03=d04*v2_04+2*v1_04, - v2_02=d03*v2_03+2*v1_03,v2_01=d02*v2_02+2*v1_02, - v2_00=d01*v2_01+2*v1_01; - p[2*20+ 0]=4*w[ 0]*( + 1*v2_00); - p[2*20+ 1]=4*w[ 1]*( +2* 1*v1_01+u0_01*v2_01); - p[2*20+ 2]=4*w[ 2]*( 2*v0_02+2*u1_02*v1_02+u0_02*v2_02); - p[2*20+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03); - p[2*20+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04); - p[2*20+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05); - p[2*20+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06); - p[2*20+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07); - p[2*20+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08); - p[2*20+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09); - p[2*20+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10*v2_10); - p[2*20+11]=4*w[11]*(u2_11*v0_11+2*u1_11*v1_11+u0_11*v2_11); - p[2*20+12]=4*w[12]*(u2_12*v0_12+2*u1_12*v1_12+u0_12*v2_12); - p[2*20+13]=4*w[13]*(u2_13*v0_13+2*u1_13*v1_13+u0_13*v2_13); - p[2*20+14]=4*w[14]*(u2_14*v0_14+2*u1_14*v1_14+u0_14*v2_14); - p[2*20+15]=4*w[15]*(u2_15*v0_15+2*u1_15*v1_15+u0_15*v2_15); - p[2*20+16]=4*w[16]*(u2_16*v0_16+2*u1_16*v1_16+u0_16*v2_16); - p[2*20+17]=4*w[17]*(u2_17*v0_17+2*u1_17*v1_17+u0_17* 2); - p[2*20+18]=4*w[18]*(u2_18*v0_18+2*u1_18* 1 ); - p[2*20+19]=4*w[19]*(u2_19* 1 ); - } - } -} - -static const double gllz_21[ 9] = { - 0.98257229660454802823448127655540587685917158823641, - 0.94197629695974553429610265066143517664965087404401, - 0.8792947553235904644511535963049440477105815515092, - 0.79600192607771240474431258966035863909041966054978, - 0.69405102606222323262731639319466662875771600610585, - 0.57583196026183068692702187033808528733577300855848, - 0.44411578327900210119451634960735128473505748656706, - 0.30198985650876488727535186785875223202107103406039, - 0.15278551580218546600635832848566943551774899331328 -}; - -static void gll_lag_21(double *restrict p, double *restrict w, - unsigned n, int d, double xh) -{ - const 
double x = xh*2; - const double d00=x+2 ,d01=x+2*gllz_21[ 0],d02=x+2*gllz_21[ 1], - d03=x+2*gllz_21[ 2],d04=x+2*gllz_21[ 3],d05=x+2*gllz_21[ 4], - d06=x+2*gllz_21[ 5],d07=x+2*gllz_21[ 6],d08=x+2*gllz_21[ 7], - d09=x+2*gllz_21[ 8],d10=x ,d11=x-2*gllz_21[ 8], - d12=x-2*gllz_21[ 7],d13=x-2*gllz_21[ 6],d14=x-2*gllz_21[ 5], - d15=x-2*gllz_21[ 4],d16=x-2*gllz_21[ 3],d17=x-2*gllz_21[ 2], - d18=x-2*gllz_21[ 1],d19=x-2*gllz_21[ 0],d20=x-2 ; - const double u0_01= 1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02, - u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05, - u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08, - u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11, - u0_13=u0_12*d12,u0_14=u0_13*d13,u0_15=u0_14*d14, - u0_16=u0_15*d15,u0_17=u0_16*d16,u0_18=u0_17*d17, - u0_19=u0_18*d18,u0_20=u0_19*d19; - const double v0_19=d20* 1,v0_18=d19*v0_19,v0_17=d18*v0_18, - v0_16=d17*v0_17,v0_15=d16*v0_16,v0_14=d15*v0_15, - v0_13=d14*v0_14,v0_12=d13*v0_13,v0_11=d12*v0_12, - v0_10=d11*v0_11,v0_09=d10*v0_10,v0_08=d09*v0_09, - v0_07=d08*v0_08,v0_06=d07*v0_07,v0_05=d06*v0_06, - v0_04=d05*v0_05,v0_03=d04*v0_04,v0_02=d03*v0_03, - v0_01=d02*v0_02,v0_00=d01*v0_01; - p[ 0]=w[ 0]* 1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02; - p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05; - p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08; - p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11*v0_11; - p[12]=w[12]*u0_12*v0_12; p[13]=w[13]*u0_13*v0_13; p[14]=w[14]*u0_14*v0_14; - p[15]=w[15]*u0_15*v0_15; p[16]=w[16]*u0_16*v0_16; p[17]=w[17]*u0_17*v0_17; - p[18]=w[18]*u0_18*v0_18; p[19]=w[19]*u0_19*v0_19; p[20]=w[20]*u0_20* 1; - if(d>0) { - const double u1_02= 1*d01+u0_01,u1_03=u1_02*d02+u0_02, - u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04, - u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06, - u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08, - u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10, - u1_12=u1_11*d11+u0_11,u1_13=u1_12*d12+u0_12, - 
u1_14=u1_13*d13+u0_13,u1_15=u1_14*d14+u0_14, - u1_16=u1_15*d15+u0_15,u1_17=u1_16*d16+u0_16, - u1_18=u1_17*d17+u0_17,u1_19=u1_18*d18+u0_18, - u1_20=u1_19*d19+u0_19; - const double v1_18=d19* 1+v0_19,v1_17=d18*v1_18+v0_18, - v1_16=d17*v1_17+v0_17,v1_15=d16*v1_16+v0_16, - v1_14=d15*v1_15+v0_15,v1_13=d14*v1_14+v0_14, - v1_12=d13*v1_13+v0_13,v1_11=d12*v1_12+v0_12, - v1_10=d11*v1_11+v0_11,v1_09=d10*v1_10+v0_10, - v1_08=d09*v1_09+v0_09,v1_07=d08*v1_08+v0_08, - v1_06=d07*v1_07+v0_07,v1_05=d06*v1_06+v0_06, - v1_04=d05*v1_05+v0_05,v1_03=d04*v1_04+v0_04, - v1_02=d03*v1_03+v0_03,v1_01=d02*v1_02+v0_02, - v1_00=d01*v1_01+v0_01; - p[21+ 0]=2*w[ 0]*( v1_00); - p[21+ 1]=2*w[ 1]*( 1*v0_01+u0_01*v1_01); - p[21+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02); - p[21+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03); - p[21+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04); - p[21+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05); - p[21+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06); - p[21+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07); - p[21+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08); - p[21+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09); - p[21+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10); - p[21+11]=2*w[11]*(u1_11*v0_11+u0_11*v1_11); - p[21+12]=2*w[12]*(u1_12*v0_12+u0_12*v1_12); - p[21+13]=2*w[13]*(u1_13*v0_13+u0_13*v1_13); - p[21+14]=2*w[14]*(u1_14*v0_14+u0_14*v1_14); - p[21+15]=2*w[15]*(u1_15*v0_15+u0_15*v1_15); - p[21+16]=2*w[16]*(u1_16*v0_16+u0_16*v1_16); - p[21+17]=2*w[17]*(u1_17*v0_17+u0_17*v1_17); - p[21+18]=2*w[18]*(u1_18*v0_18+u0_18*v1_18); - p[21+19]=2*w[19]*(u1_19*v0_19+u0_19* 1); - p[21+20]=2*w[20]*(u1_20 ); - if(d>1) { - const double u2_03= 2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03, - u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05, - u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07, - u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09, - u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11, - u2_13=u2_12*d12+2*u1_12,u2_14=u2_13*d13+2*u1_13, - u2_15=u2_14*d14+2*u1_14,u2_16=u2_15*d15+2*u1_15, - u2_17=u2_16*d16+2*u1_16,u2_18=u2_17*d17+2*u1_17, - 
u2_19=u2_18*d18+2*u1_18,u2_20=u2_19*d19+2*u1_19; - const double v2_17=d18* 2+2*v1_18,v2_16=d17*v2_17+2*v1_17, - v2_15=d16*v2_16+2*v1_16,v2_14=d15*v2_15+2*v1_15, - v2_13=d14*v2_14+2*v1_14,v2_12=d13*v2_13+2*v1_13, - v2_11=d12*v2_12+2*v1_12,v2_10=d11*v2_11+2*v1_11, - v2_09=d10*v2_10+2*v1_10,v2_08=d09*v2_09+2*v1_09, - v2_07=d08*v2_08+2*v1_08,v2_06=d07*v2_07+2*v1_07, - v2_05=d06*v2_06+2*v1_06,v2_04=d05*v2_05+2*v1_05, - v2_03=d04*v2_04+2*v1_04,v2_02=d03*v2_03+2*v1_03, - v2_01=d02*v2_02+2*v1_02,v2_00=d01*v2_01+2*v1_01; - p[2*21+ 0]=4*w[ 0]*( + 1*v2_00); - p[2*21+ 1]=4*w[ 1]*( +2* 1*v1_01+u0_01*v2_01); - p[2*21+ 2]=4*w[ 2]*( 2*v0_02+2*u1_02*v1_02+u0_02*v2_02); - p[2*21+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03); - p[2*21+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04); - p[2*21+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05); - p[2*21+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06); - p[2*21+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07); - p[2*21+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08); - p[2*21+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09); - p[2*21+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10*v2_10); - p[2*21+11]=4*w[11]*(u2_11*v0_11+2*u1_11*v1_11+u0_11*v2_11); - p[2*21+12]=4*w[12]*(u2_12*v0_12+2*u1_12*v1_12+u0_12*v2_12); - p[2*21+13]=4*w[13]*(u2_13*v0_13+2*u1_13*v1_13+u0_13*v2_13); - p[2*21+14]=4*w[14]*(u2_14*v0_14+2*u1_14*v1_14+u0_14*v2_14); - p[2*21+15]=4*w[15]*(u2_15*v0_15+2*u1_15*v1_15+u0_15*v2_15); - p[2*21+16]=4*w[16]*(u2_16*v0_16+2*u1_16*v1_16+u0_16*v2_16); - p[2*21+17]=4*w[17]*(u2_17*v0_17+2*u1_17*v1_17+u0_17*v2_17); - p[2*21+18]=4*w[18]*(u2_18*v0_18+2*u1_18*v1_18+u0_18* 2); - p[2*21+19]=4*w[19]*(u2_19*v0_19+2*u1_19* 1 ); - p[2*21+20]=4*w[20]*(u2_20* 1 ); - } - } -} - -static const double gllz_22[10] = { - 0.98415243845764617655228962221207029660551353611952, - 0.94720428399922868052421376661572950991206204534136, - 0.89006229019090447052965782577908679019953408284715, - 
0.8139489276119211360454418480561350424386685149071, - 0.7204872399612021581198818963984657585933454261195, - 0.6116694382842589712262116058699265993454403046077, - 0.48981487518990234980875123568327004167127163579515, - 0.35752071013891953806095728024017912928330710394294, - 0.21760658515928504178795509346539276327500669401419, - 0.073054540010898334761088790464107356192779236333516 -}; - -static void gll_lag_22(double *restrict p, double *restrict w, - unsigned n, int d, double xh) -{ - const double x = xh*2; - const double d00=x+2 ,d01=x+2*gllz_22[ 0],d02=x+2*gllz_22[ 1], - d03=x+2*gllz_22[ 2],d04=x+2*gllz_22[ 3],d05=x+2*gllz_22[ 4], - d06=x+2*gllz_22[ 5],d07=x+2*gllz_22[ 6],d08=x+2*gllz_22[ 7], - d09=x+2*gllz_22[ 8],d10=x+2*gllz_22[ 9],d11=x-2*gllz_22[ 9], - d12=x-2*gllz_22[ 8],d13=x-2*gllz_22[ 7],d14=x-2*gllz_22[ 6], - d15=x-2*gllz_22[ 5],d16=x-2*gllz_22[ 4],d17=x-2*gllz_22[ 3], - d18=x-2*gllz_22[ 2],d19=x-2*gllz_22[ 1],d20=x-2*gllz_22[ 0], - d21=x-2 ; - const double u0_01= 1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02, - u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05, - u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08, - u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11, - u0_13=u0_12*d12,u0_14=u0_13*d13,u0_15=u0_14*d14, - u0_16=u0_15*d15,u0_17=u0_16*d16,u0_18=u0_17*d17, - u0_19=u0_18*d18,u0_20=u0_19*d19,u0_21=u0_20*d20; - const double v0_20=d21* 1,v0_19=d20*v0_20,v0_18=d19*v0_19, - v0_17=d18*v0_18,v0_16=d17*v0_17,v0_15=d16*v0_16, - v0_14=d15*v0_15,v0_13=d14*v0_14,v0_12=d13*v0_13, - v0_11=d12*v0_12,v0_10=d11*v0_11,v0_09=d10*v0_10, - v0_08=d09*v0_09,v0_07=d08*v0_08,v0_06=d07*v0_07, - v0_05=d06*v0_06,v0_04=d05*v0_05,v0_03=d04*v0_04, - v0_02=d03*v0_03,v0_01=d02*v0_02,v0_00=d01*v0_01; - p[ 0]=w[ 0]* 1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02; - p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05; - p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08; - p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; 
p[11]=w[11]*u0_11*v0_11; - p[12]=w[12]*u0_12*v0_12; p[13]=w[13]*u0_13*v0_13; p[14]=w[14]*u0_14*v0_14; - p[15]=w[15]*u0_15*v0_15; p[16]=w[16]*u0_16*v0_16; p[17]=w[17]*u0_17*v0_17; - p[18]=w[18]*u0_18*v0_18; p[19]=w[19]*u0_19*v0_19; p[20]=w[20]*u0_20*v0_20; - p[21]=w[21]*u0_21* 1; - if(d>0) { - const double u1_02= 1*d01+u0_01,u1_03=u1_02*d02+u0_02, - u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04, - u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06, - u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08, - u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10, - u1_12=u1_11*d11+u0_11,u1_13=u1_12*d12+u0_12, - u1_14=u1_13*d13+u0_13,u1_15=u1_14*d14+u0_14, - u1_16=u1_15*d15+u0_15,u1_17=u1_16*d16+u0_16, - u1_18=u1_17*d17+u0_17,u1_19=u1_18*d18+u0_18, - u1_20=u1_19*d19+u0_19,u1_21=u1_20*d20+u0_20; - const double v1_19=d20* 1+v0_20,v1_18=d19*v1_19+v0_19, - v1_17=d18*v1_18+v0_18,v1_16=d17*v1_17+v0_17, - v1_15=d16*v1_16+v0_16,v1_14=d15*v1_15+v0_15, - v1_13=d14*v1_14+v0_14,v1_12=d13*v1_13+v0_13, - v1_11=d12*v1_12+v0_12,v1_10=d11*v1_11+v0_11, - v1_09=d10*v1_10+v0_10,v1_08=d09*v1_09+v0_09, - v1_07=d08*v1_08+v0_08,v1_06=d07*v1_07+v0_07, - v1_05=d06*v1_06+v0_06,v1_04=d05*v1_05+v0_05, - v1_03=d04*v1_04+v0_04,v1_02=d03*v1_03+v0_03, - v1_01=d02*v1_02+v0_02,v1_00=d01*v1_01+v0_01; - p[22+ 0]=2*w[ 0]*( v1_00); - p[22+ 1]=2*w[ 1]*( 1*v0_01+u0_01*v1_01); - p[22+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02); - p[22+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03); - p[22+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04); - p[22+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05); - p[22+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06); - p[22+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07); - p[22+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08); - p[22+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09); - p[22+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10); - p[22+11]=2*w[11]*(u1_11*v0_11+u0_11*v1_11); - p[22+12]=2*w[12]*(u1_12*v0_12+u0_12*v1_12); - p[22+13]=2*w[13]*(u1_13*v0_13+u0_13*v1_13); - p[22+14]=2*w[14]*(u1_14*v0_14+u0_14*v1_14); - p[22+15]=2*w[15]*(u1_15*v0_15+u0_15*v1_15); - 
p[22+16]=2*w[16]*(u1_16*v0_16+u0_16*v1_16); - p[22+17]=2*w[17]*(u1_17*v0_17+u0_17*v1_17); - p[22+18]=2*w[18]*(u1_18*v0_18+u0_18*v1_18); - p[22+19]=2*w[19]*(u1_19*v0_19+u0_19*v1_19); - p[22+20]=2*w[20]*(u1_20*v0_20+u0_20* 1); - p[22+21]=2*w[21]*(u1_21 ); - if(d>1) { - const double u2_03= 2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03, - u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05, - u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07, - u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09, - u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11, - u2_13=u2_12*d12+2*u1_12,u2_14=u2_13*d13+2*u1_13, - u2_15=u2_14*d14+2*u1_14,u2_16=u2_15*d15+2*u1_15, - u2_17=u2_16*d16+2*u1_16,u2_18=u2_17*d17+2*u1_17, - u2_19=u2_18*d18+2*u1_18,u2_20=u2_19*d19+2*u1_19, - u2_21=u2_20*d20+2*u1_20; - const double v2_18=d19* 2+2*v1_19,v2_17=d18*v2_18+2*v1_18, - v2_16=d17*v2_17+2*v1_17,v2_15=d16*v2_16+2*v1_16, - v2_14=d15*v2_15+2*v1_15,v2_13=d14*v2_14+2*v1_14, - v2_12=d13*v2_13+2*v1_13,v2_11=d12*v2_12+2*v1_12, - v2_10=d11*v2_11+2*v1_11,v2_09=d10*v2_10+2*v1_10, - v2_08=d09*v2_09+2*v1_09,v2_07=d08*v2_08+2*v1_08, - v2_06=d07*v2_07+2*v1_07,v2_05=d06*v2_06+2*v1_06, - v2_04=d05*v2_05+2*v1_05,v2_03=d04*v2_04+2*v1_04, - v2_02=d03*v2_03+2*v1_03,v2_01=d02*v2_02+2*v1_02, - v2_00=d01*v2_01+2*v1_01; - p[2*22+ 0]=4*w[ 0]*( + 1*v2_00); - p[2*22+ 1]=4*w[ 1]*( +2* 1*v1_01+u0_01*v2_01); - p[2*22+ 2]=4*w[ 2]*( 2*v0_02+2*u1_02*v1_02+u0_02*v2_02); - p[2*22+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03); - p[2*22+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04); - p[2*22+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05); - p[2*22+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06); - p[2*22+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07); - p[2*22+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08); - p[2*22+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09); - p[2*22+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10*v2_10); - p[2*22+11]=4*w[11]*(u2_11*v0_11+2*u1_11*v1_11+u0_11*v2_11); - 
p[2*22+12]=4*w[12]*(u2_12*v0_12+2*u1_12*v1_12+u0_12*v2_12); - p[2*22+13]=4*w[13]*(u2_13*v0_13+2*u1_13*v1_13+u0_13*v2_13); - p[2*22+14]=4*w[14]*(u2_14*v0_14+2*u1_14*v1_14+u0_14*v2_14); - p[2*22+15]=4*w[15]*(u2_15*v0_15+2*u1_15*v1_15+u0_15*v2_15); - p[2*22+16]=4*w[16]*(u2_16*v0_16+2*u1_16*v1_16+u0_16*v2_16); - p[2*22+17]=4*w[17]*(u2_17*v0_17+2*u1_17*v1_17+u0_17*v2_17); - p[2*22+18]=4*w[18]*(u2_18*v0_18+2*u1_18*v1_18+u0_18*v2_18); - p[2*22+19]=4*w[19]*(u2_19*v0_19+2*u1_19*v1_19+u0_19* 2); - p[2*22+20]=4*w[20]*(u2_20*v0_20+2*u1_20* 1 ); - p[2*22+21]=4*w[21]*(u2_21* 1 ); - } - } -} - -static const double gllz_23[10] = { - 0.98552715587873257808146276673809909902061079213965, - 0.9517579557107102041356396798514291558483519254488, - 0.89945855804034501095016032034736715791179834813929, - 0.82965109665128588622320061929000488459851188301333, - 0.74369504117206068394516354306699679128721922895386, - 0.6432636444601362084761455336027687438913118818023, - 0.53031177113684416813011532015229981113034651492734, - 0.40703793791447482919595048821509563955195372399417, - 0.27584154894579306710687763267913520417319110660942, - 0.13927620404066839859186261298276693390854445717444 -}; - -static void gll_lag_23(double *restrict p, double *restrict w, - unsigned n, int d, double xh) -{ - const double x = xh*2; - const double d00=x+2 ,d01=x+2*gllz_23[ 0],d02=x+2*gllz_23[ 1], - d03=x+2*gllz_23[ 2],d04=x+2*gllz_23[ 3],d05=x+2*gllz_23[ 4], - d06=x+2*gllz_23[ 5],d07=x+2*gllz_23[ 6],d08=x+2*gllz_23[ 7], - d09=x+2*gllz_23[ 8],d10=x+2*gllz_23[ 9],d11=x , - d12=x-2*gllz_23[ 9],d13=x-2*gllz_23[ 8],d14=x-2*gllz_23[ 7], - d15=x-2*gllz_23[ 6],d16=x-2*gllz_23[ 5],d17=x-2*gllz_23[ 4], - d18=x-2*gllz_23[ 3],d19=x-2*gllz_23[ 2],d20=x-2*gllz_23[ 1], - d21=x-2*gllz_23[ 0],d22=x-2 ; - const double u0_01= 1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02, - u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05, - u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08, - u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11, - 
u0_13=u0_12*d12,u0_14=u0_13*d13,u0_15=u0_14*d14, - u0_16=u0_15*d15,u0_17=u0_16*d16,u0_18=u0_17*d17, - u0_19=u0_18*d18,u0_20=u0_19*d19,u0_21=u0_20*d20, - u0_22=u0_21*d21; - const double v0_21=d22* 1,v0_20=d21*v0_21,v0_19=d20*v0_20, - v0_18=d19*v0_19,v0_17=d18*v0_18,v0_16=d17*v0_17, - v0_15=d16*v0_16,v0_14=d15*v0_15,v0_13=d14*v0_14, - v0_12=d13*v0_13,v0_11=d12*v0_12,v0_10=d11*v0_11, - v0_09=d10*v0_10,v0_08=d09*v0_09,v0_07=d08*v0_08, - v0_06=d07*v0_07,v0_05=d06*v0_06,v0_04=d05*v0_05, - v0_03=d04*v0_04,v0_02=d03*v0_03,v0_01=d02*v0_02, - v0_00=d01*v0_01; - p[ 0]=w[ 0]* 1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02; - p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05; - p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08; - p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11*v0_11; - p[12]=w[12]*u0_12*v0_12; p[13]=w[13]*u0_13*v0_13; p[14]=w[14]*u0_14*v0_14; - p[15]=w[15]*u0_15*v0_15; p[16]=w[16]*u0_16*v0_16; p[17]=w[17]*u0_17*v0_17; - p[18]=w[18]*u0_18*v0_18; p[19]=w[19]*u0_19*v0_19; p[20]=w[20]*u0_20*v0_20; - p[21]=w[21]*u0_21*v0_21; p[22]=w[22]*u0_22* 1; - if(d>0) { - const double u1_02= 1*d01+u0_01,u1_03=u1_02*d02+u0_02, - u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04, - u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06, - u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08, - u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10, - u1_12=u1_11*d11+u0_11,u1_13=u1_12*d12+u0_12, - u1_14=u1_13*d13+u0_13,u1_15=u1_14*d14+u0_14, - u1_16=u1_15*d15+u0_15,u1_17=u1_16*d16+u0_16, - u1_18=u1_17*d17+u0_17,u1_19=u1_18*d18+u0_18, - u1_20=u1_19*d19+u0_19,u1_21=u1_20*d20+u0_20, - u1_22=u1_21*d21+u0_21; - const double v1_20=d21* 1+v0_21,v1_19=d20*v1_20+v0_20, - v1_18=d19*v1_19+v0_19,v1_17=d18*v1_18+v0_18, - v1_16=d17*v1_17+v0_17,v1_15=d16*v1_16+v0_16, - v1_14=d15*v1_15+v0_15,v1_13=d14*v1_14+v0_14, - v1_12=d13*v1_13+v0_13,v1_11=d12*v1_12+v0_12, - v1_10=d11*v1_11+v0_11,v1_09=d10*v1_10+v0_10, - v1_08=d09*v1_09+v0_09,v1_07=d08*v1_08+v0_08, - 
v1_06=d07*v1_07+v0_07,v1_05=d06*v1_06+v0_06, - v1_04=d05*v1_05+v0_05,v1_03=d04*v1_04+v0_04, - v1_02=d03*v1_03+v0_03,v1_01=d02*v1_02+v0_02, - v1_00=d01*v1_01+v0_01; - p[23+ 0]=2*w[ 0]*( v1_00); - p[23+ 1]=2*w[ 1]*( 1*v0_01+u0_01*v1_01); - p[23+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02); - p[23+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03); - p[23+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04); - p[23+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05); - p[23+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06); - p[23+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07); - p[23+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08); - p[23+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09); - p[23+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10); - p[23+11]=2*w[11]*(u1_11*v0_11+u0_11*v1_11); - p[23+12]=2*w[12]*(u1_12*v0_12+u0_12*v1_12); - p[23+13]=2*w[13]*(u1_13*v0_13+u0_13*v1_13); - p[23+14]=2*w[14]*(u1_14*v0_14+u0_14*v1_14); - p[23+15]=2*w[15]*(u1_15*v0_15+u0_15*v1_15); - p[23+16]=2*w[16]*(u1_16*v0_16+u0_16*v1_16); - p[23+17]=2*w[17]*(u1_17*v0_17+u0_17*v1_17); - p[23+18]=2*w[18]*(u1_18*v0_18+u0_18*v1_18); - p[23+19]=2*w[19]*(u1_19*v0_19+u0_19*v1_19); - p[23+20]=2*w[20]*(u1_20*v0_20+u0_20*v1_20); - p[23+21]=2*w[21]*(u1_21*v0_21+u0_21* 1); - p[23+22]=2*w[22]*(u1_22 ); - if(d>1) { - const double u2_03= 2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03, - u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05, - u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07, - u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09, - u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11, - u2_13=u2_12*d12+2*u1_12,u2_14=u2_13*d13+2*u1_13, - u2_15=u2_14*d14+2*u1_14,u2_16=u2_15*d15+2*u1_15, - u2_17=u2_16*d16+2*u1_16,u2_18=u2_17*d17+2*u1_17, - u2_19=u2_18*d18+2*u1_18,u2_20=u2_19*d19+2*u1_19, - u2_21=u2_20*d20+2*u1_20,u2_22=u2_21*d21+2*u1_21; - const double v2_19=d20* 2+2*v1_20,v2_18=d19*v2_19+2*v1_19, - v2_17=d18*v2_18+2*v1_18,v2_16=d17*v2_17+2*v1_17, - v2_15=d16*v2_16+2*v1_16,v2_14=d15*v2_15+2*v1_15, - v2_13=d14*v2_14+2*v1_14,v2_12=d13*v2_13+2*v1_13, - v2_11=d12*v2_12+2*v1_12,v2_10=d11*v2_11+2*v1_11, - 
v2_09=d10*v2_10+2*v1_10,v2_08=d09*v2_09+2*v1_09, - v2_07=d08*v2_08+2*v1_08,v2_06=d07*v2_07+2*v1_07, - v2_05=d06*v2_06+2*v1_06,v2_04=d05*v2_05+2*v1_05, - v2_03=d04*v2_04+2*v1_04,v2_02=d03*v2_03+2*v1_03, - v2_01=d02*v2_02+2*v1_02,v2_00=d01*v2_01+2*v1_01; - p[2*23+ 0]=4*w[ 0]*( + 1*v2_00); - p[2*23+ 1]=4*w[ 1]*( +2* 1*v1_01+u0_01*v2_01); - p[2*23+ 2]=4*w[ 2]*( 2*v0_02+2*u1_02*v1_02+u0_02*v2_02); - p[2*23+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03); - p[2*23+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04); - p[2*23+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05); - p[2*23+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06); - p[2*23+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07); - p[2*23+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08); - p[2*23+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09); - p[2*23+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10*v2_10); - p[2*23+11]=4*w[11]*(u2_11*v0_11+2*u1_11*v1_11+u0_11*v2_11); - p[2*23+12]=4*w[12]*(u2_12*v0_12+2*u1_12*v1_12+u0_12*v2_12); - p[2*23+13]=4*w[13]*(u2_13*v0_13+2*u1_13*v1_13+u0_13*v2_13); - p[2*23+14]=4*w[14]*(u2_14*v0_14+2*u1_14*v1_14+u0_14*v2_14); - p[2*23+15]=4*w[15]*(u2_15*v0_15+2*u1_15*v1_15+u0_15*v2_15); - p[2*23+16]=4*w[16]*(u2_16*v0_16+2*u1_16*v1_16+u0_16*v2_16); - p[2*23+17]=4*w[17]*(u2_17*v0_17+2*u1_17*v1_17+u0_17*v2_17); - p[2*23+18]=4*w[18]*(u2_18*v0_18+2*u1_18*v1_18+u0_18*v2_18); - p[2*23+19]=4*w[19]*(u2_19*v0_19+2*u1_19*v1_19+u0_19*v2_19); - p[2*23+20]=4*w[20]*(u2_20*v0_20+2*u1_20*v1_20+u0_20* 2); - p[2*23+21]=4*w[21]*(u2_21*v0_21+2*u1_21* 1 ); - p[2*23+22]=4*w[22]*(u2_22* 1 ); - } - } -} - -static const double gllz_24[11] = { - 0.9867305535051608835530867381544749753719197924133, - 0.95574822092988635802697713055064483107073304295574, - 0.90770567511350652199515299646620774920842011387828, - 0.84346407015487204062330503742334228584107610081033, - 0.7641704824204933077873752809522936513210604492369, - 0.67124010526412869983566485818700675657402328894643, - 
0.56633135797929531218940954454228377043889499712648, - 0.45131637321432261824821849156962244882308821831249, - 0.3282476133755109120333891793596093437011778687727, - 0.19932125339083266723657253912499073081187559142148, - 0.066837993737228578113641808391677309796223208917628 -}; - -static void gll_lag_24(double *restrict p, double *restrict w, - unsigned n, int d, double xh) -{ - const double x = xh*2; - const double d00=x+2 ,d01=x+2*gllz_24[ 0],d02=x+2*gllz_24[ 1], - d03=x+2*gllz_24[ 2],d04=x+2*gllz_24[ 3],d05=x+2*gllz_24[ 4], - d06=x+2*gllz_24[ 5],d07=x+2*gllz_24[ 6],d08=x+2*gllz_24[ 7], - d09=x+2*gllz_24[ 8],d10=x+2*gllz_24[ 9],d11=x+2*gllz_24[10], - d12=x-2*gllz_24[10],d13=x-2*gllz_24[ 9],d14=x-2*gllz_24[ 8], - d15=x-2*gllz_24[ 7],d16=x-2*gllz_24[ 6],d17=x-2*gllz_24[ 5], - d18=x-2*gllz_24[ 4],d19=x-2*gllz_24[ 3],d20=x-2*gllz_24[ 2], - d21=x-2*gllz_24[ 1],d22=x-2*gllz_24[ 0],d23=x-2 ; - const double u0_01= 1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02, - u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05, - u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08, - u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11, - u0_13=u0_12*d12,u0_14=u0_13*d13,u0_15=u0_14*d14, - u0_16=u0_15*d15,u0_17=u0_16*d16,u0_18=u0_17*d17, - u0_19=u0_18*d18,u0_20=u0_19*d19,u0_21=u0_20*d20, - u0_22=u0_21*d21,u0_23=u0_22*d22; - const double v0_22=d23* 1,v0_21=d22*v0_22,v0_20=d21*v0_21, - v0_19=d20*v0_20,v0_18=d19*v0_19,v0_17=d18*v0_18, - v0_16=d17*v0_17,v0_15=d16*v0_16,v0_14=d15*v0_15, - v0_13=d14*v0_14,v0_12=d13*v0_13,v0_11=d12*v0_12, - v0_10=d11*v0_11,v0_09=d10*v0_10,v0_08=d09*v0_09, - v0_07=d08*v0_08,v0_06=d07*v0_07,v0_05=d06*v0_06, - v0_04=d05*v0_05,v0_03=d04*v0_04,v0_02=d03*v0_03, - v0_01=d02*v0_02,v0_00=d01*v0_01; - p[ 0]=w[ 0]* 1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02; - p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05; - p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08; - p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; 
p[11]=w[11]*u0_11*v0_11; - p[12]=w[12]*u0_12*v0_12; p[13]=w[13]*u0_13*v0_13; p[14]=w[14]*u0_14*v0_14; - p[15]=w[15]*u0_15*v0_15; p[16]=w[16]*u0_16*v0_16; p[17]=w[17]*u0_17*v0_17; - p[18]=w[18]*u0_18*v0_18; p[19]=w[19]*u0_19*v0_19; p[20]=w[20]*u0_20*v0_20; - p[21]=w[21]*u0_21*v0_21; p[22]=w[22]*u0_22*v0_22; p[23]=w[23]*u0_23* 1; - if(d>0) { - const double u1_02= 1*d01+u0_01,u1_03=u1_02*d02+u0_02, - u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04, - u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06, - u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08, - u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10, - u1_12=u1_11*d11+u0_11,u1_13=u1_12*d12+u0_12, - u1_14=u1_13*d13+u0_13,u1_15=u1_14*d14+u0_14, - u1_16=u1_15*d15+u0_15,u1_17=u1_16*d16+u0_16, - u1_18=u1_17*d17+u0_17,u1_19=u1_18*d18+u0_18, - u1_20=u1_19*d19+u0_19,u1_21=u1_20*d20+u0_20, - u1_22=u1_21*d21+u0_21,u1_23=u1_22*d22+u0_22; - const double v1_21=d22* 1+v0_22,v1_20=d21*v1_21+v0_21, - v1_19=d20*v1_20+v0_20,v1_18=d19*v1_19+v0_19, - v1_17=d18*v1_18+v0_18,v1_16=d17*v1_17+v0_17, - v1_15=d16*v1_16+v0_16,v1_14=d15*v1_15+v0_15, - v1_13=d14*v1_14+v0_14,v1_12=d13*v1_13+v0_13, - v1_11=d12*v1_12+v0_12,v1_10=d11*v1_11+v0_11, - v1_09=d10*v1_10+v0_10,v1_08=d09*v1_09+v0_09, - v1_07=d08*v1_08+v0_08,v1_06=d07*v1_07+v0_07, - v1_05=d06*v1_06+v0_06,v1_04=d05*v1_05+v0_05, - v1_03=d04*v1_04+v0_04,v1_02=d03*v1_03+v0_03, - v1_01=d02*v1_02+v0_02,v1_00=d01*v1_01+v0_01; - p[24+ 0]=2*w[ 0]*( v1_00); - p[24+ 1]=2*w[ 1]*( 1*v0_01+u0_01*v1_01); - p[24+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02); - p[24+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03); - p[24+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04); - p[24+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05); - p[24+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06); - p[24+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07); - p[24+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08); - p[24+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09); - p[24+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10); - p[24+11]=2*w[11]*(u1_11*v0_11+u0_11*v1_11); - p[24+12]=2*w[12]*(u1_12*v0_12+u0_12*v1_12); - 
p[24+13]=2*w[13]*(u1_13*v0_13+u0_13*v1_13); - p[24+14]=2*w[14]*(u1_14*v0_14+u0_14*v1_14); - p[24+15]=2*w[15]*(u1_15*v0_15+u0_15*v1_15); - p[24+16]=2*w[16]*(u1_16*v0_16+u0_16*v1_16); - p[24+17]=2*w[17]*(u1_17*v0_17+u0_17*v1_17); - p[24+18]=2*w[18]*(u1_18*v0_18+u0_18*v1_18); - p[24+19]=2*w[19]*(u1_19*v0_19+u0_19*v1_19); - p[24+20]=2*w[20]*(u1_20*v0_20+u0_20*v1_20); - p[24+21]=2*w[21]*(u1_21*v0_21+u0_21*v1_21); - p[24+22]=2*w[22]*(u1_22*v0_22+u0_22* 1); - p[24+23]=2*w[23]*(u1_23 ); - if(d>1) { - const double u2_03= 2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03, - u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05, - u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07, - u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09, - u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11, - u2_13=u2_12*d12+2*u1_12,u2_14=u2_13*d13+2*u1_13, - u2_15=u2_14*d14+2*u1_14,u2_16=u2_15*d15+2*u1_15, - u2_17=u2_16*d16+2*u1_16,u2_18=u2_17*d17+2*u1_17, - u2_19=u2_18*d18+2*u1_18,u2_20=u2_19*d19+2*u1_19, - u2_21=u2_20*d20+2*u1_20,u2_22=u2_21*d21+2*u1_21, - u2_23=u2_22*d22+2*u1_22; - const double v2_20=d21* 2+2*v1_21,v2_19=d20*v2_20+2*v1_20, - v2_18=d19*v2_19+2*v1_19,v2_17=d18*v2_18+2*v1_18, - v2_16=d17*v2_17+2*v1_17,v2_15=d16*v2_16+2*v1_16, - v2_14=d15*v2_15+2*v1_15,v2_13=d14*v2_14+2*v1_14, - v2_12=d13*v2_13+2*v1_13,v2_11=d12*v2_12+2*v1_12, - v2_10=d11*v2_11+2*v1_11,v2_09=d10*v2_10+2*v1_10, - v2_08=d09*v2_09+2*v1_09,v2_07=d08*v2_08+2*v1_08, - v2_06=d07*v2_07+2*v1_07,v2_05=d06*v2_06+2*v1_06, - v2_04=d05*v2_05+2*v1_05,v2_03=d04*v2_04+2*v1_04, - v2_02=d03*v2_03+2*v1_03,v2_01=d02*v2_02+2*v1_02, - v2_00=d01*v2_01+2*v1_01; - p[2*24+ 0]=4*w[ 0]*( + 1*v2_00); - p[2*24+ 1]=4*w[ 1]*( +2* 1*v1_01+u0_01*v2_01); - p[2*24+ 2]=4*w[ 2]*( 2*v0_02+2*u1_02*v1_02+u0_02*v2_02); - p[2*24+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03); - p[2*24+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04); - p[2*24+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05); - p[2*24+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06); - p[2*24+ 7]=4*w[ 
7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07); - p[2*24+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08); - p[2*24+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09); - p[2*24+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10*v2_10); - p[2*24+11]=4*w[11]*(u2_11*v0_11+2*u1_11*v1_11+u0_11*v2_11); - p[2*24+12]=4*w[12]*(u2_12*v0_12+2*u1_12*v1_12+u0_12*v2_12); - p[2*24+13]=4*w[13]*(u2_13*v0_13+2*u1_13*v1_13+u0_13*v2_13); - p[2*24+14]=4*w[14]*(u2_14*v0_14+2*u1_14*v1_14+u0_14*v2_14); - p[2*24+15]=4*w[15]*(u2_15*v0_15+2*u1_15*v1_15+u0_15*v2_15); - p[2*24+16]=4*w[16]*(u2_16*v0_16+2*u1_16*v1_16+u0_16*v2_16); - p[2*24+17]=4*w[17]*(u2_17*v0_17+2*u1_17*v1_17+u0_17*v2_17); - p[2*24+18]=4*w[18]*(u2_18*v0_18+2*u1_18*v1_18+u0_18*v2_18); - p[2*24+19]=4*w[19]*(u2_19*v0_19+2*u1_19*v1_19+u0_19*v2_19); - p[2*24+20]=4*w[20]*(u2_20*v0_20+2*u1_20*v1_20+u0_20*v2_20); - p[2*24+21]=4*w[21]*(u2_21*v0_21+2*u1_21*v1_21+u0_21* 2); - p[2*24+22]=4*w[22]*(u2_22*v0_22+2*u1_22* 1 ); - p[2*24+23]=4*w[23]*(u2_23* 1 ); - } - } -} - -static const double *const gllz_table[21] = { - gllz_04, gllz_05, gllz_06, gllz_07, gllz_08, gllz_09, gllz_10, gllz_11, - gllz_12, gllz_13, gllz_14, gllz_15, gllz_16, gllz_17, gllz_18, gllz_19, - gllz_20, gllz_21, gllz_22, gllz_23, gllz_24 -}; - -static lagrange_fun *const gll_lag_table[23] = { - &gll_lag_02, &gll_lag_03, &gll_lag_04, &gll_lag_05, &gll_lag_06, &gll_lag_07, - &gll_lag_08, &gll_lag_09, &gll_lag_10, &gll_lag_11, &gll_lag_12, &gll_lag_13, - &gll_lag_14, &gll_lag_15, &gll_lag_16, &gll_lag_17, &gll_lag_18, &gll_lag_19, - &gll_lag_20, &gll_lag_21, &gll_lag_22, &gll_lag_23, &gll_lag_24 -}; - diff --git a/3rdParty/gslib/src/rand_elt_test.c b/3rdParty/gslib/src/rand_elt_test.c deleted file mode 100644 index 1e11dae96..000000000 --- a/3rdParty/gslib/src/rand_elt_test.c +++ /dev/null @@ -1,169 +0,0 @@ -#include -#include -#include "c99.h" -#include "types.h" -#include "name.h" -#include "poly.h" -#include "lob_bnd.h" - -static double det_2(const double A[4]) { return 
A[0]*A[3]-A[1]*A[2]; } - -static double quad_2(const double x0, const double g[2], const double H[3], - const double r[2]) -{ - return x0 + (g[0]*r[0]+g[1]*r[1]) - + ( r[0] * (H[0]*r[0]+H[1]*r[1]) - + r[1] * (H[1]*r[0]+H[2]*r[1]) )/2; -} - -static void quad_2_grad(double grad[2], const double g[2], const double H[3], - const double r[2]) -{ - grad[0] = g[0] + (H[0]*r[0]+H[1]*r[1]); - grad[1] = g[1] + (H[1]*r[0]+H[2]*r[1]); -} - -static double quad_2_jac(const double g[4], const double H[6], - const double r[2]) -{ - double J[4]; - quad_2_grad(J ,g ,H ,r); - quad_2_grad(J+2,g+2,H+3,r); - return det_2(J); -} - -static double det_3(const double A[9]) -{ - const double a = A[4]*A[8]-A[5]*A[7], - b = A[5]*A[6]-A[3]*A[8], - c = A[3]*A[7]-A[4]*A[6]; - return A[0]*a+A[1]*b+A[2]*c; -} - -static double quad_3(const double x0, const double g[3], const double H[6], - const double r[3]) -{ - return x0 + (g[0]*r[0]+g[1]*r[1]+g[2]*r[2]) - + ( r[0] * (H[0]*r[0]+H[1]*r[1]+H[2]*r[2]) - + r[1] * (H[1]*r[0]+H[3]*r[1]+H[4]*r[2]) - + r[2] * (H[2]*r[0]+H[4]*r[1]+H[5]*r[2]) )/2; -} - -static void quad_3_grad(double grad[3], const double g[3], const double H[6], - const double r[3]) -{ - grad[0] = g[0] + (H[0]*r[0]+H[1]*r[1]+H[2]*r[2]); - grad[1] = g[1] + (H[1]*r[0]+H[3]*r[1]+H[4]*r[2]); - grad[2] = g[2] + (H[2]*r[0]+H[4]*r[1]+H[5]*r[2]); -} - -static double quad_3_jac(const double g[9], const double H[18], - const double r[3]) -{ - double J[9]; - quad_3_grad(J ,g ,H ,r); - quad_3_grad(J+3,g+3,H+ 6,r); - quad_3_grad(J+6,g+6,H+12,r); - return det_3(J); -} - -void rand_elt_2(double *x, double *y, - const double *zr, unsigned nr, - const double *zs, unsigned ns) -{ - static int init=0; - static double z4[4], lob_bnd_data[16+3*4*(2*16+1)], - work[2*16*(4+16+1)]; - unsigned i,j; - double x0[2], g[4], H[6], jac[4*4], r[2]; - struct dbl_range jr; - if(!init) { - init=1; - lobatto_nodes(z4,4); - lob_bnd_setup(lob_bnd_data,4,16); - } - do { - for(i=0;i<4;++i) g[i] = -1+2*(rand()/(double)RAND_MAX); 
- for(i=0;i<6;++i) H[i] =.5*(-1+2*(rand()/(double)RAND_MAX)); - for(j=0;j<4;++j) { r[1] = z4[j]; - for(i=0;i<4;++i) { r[0] = z4[i]; - jac[j*4+i] = quad_2_jac(g,H,r); - } - } - jr = lob_bnd_2(lob_bnd_data,4,16, lob_bnd_data,4,16, jac, work); - /*printf("Jacobian range %g, %g\n", jr.min, jr.max);*/ - } while(jr.max*jr.min<=0); - for(i=0;i< 2;++i) x0[i] = -1+2*(rand()/(double)RAND_MAX); - for(j=0;j -#include -#include -#include "c99.h" -#include "name.h" -#include "types.h" -#include "fail.h" -#include "mem.h" -#include "sort.h" - -#define sarray_permute_ PREFIXED_NAME(sarray_permute_) -#define sarray_permute_buf_ PREFIXED_NAME(sarray_permute_buf_) - -void sarray_permute_(size_t size, void *A, size_t n, uint *perm, void *work) -{ - char *const ar = A, *const item = work; - sint *const fperm = (sint*)perm; - uint i; - for(i=0;iptr, - (char*)buf->ptr + align_as_(align,n*sizeof(uint))); -} diff --git a/3rdParty/gslib/src/sarray_sort.h b/3rdParty/gslib/src/sarray_sort.h deleted file mode 100644 index 77dc6531e..000000000 --- a/3rdParty/gslib/src/sarray_sort.h +++ /dev/null @@ -1,89 +0,0 @@ -#ifndef SARRAY_SORT_H -#define SARRAY_SORT_H - -#if !defined(SORT_H) -#warning "sarray_sort.h" requires "sort.h" -#endif - -/*------------------------------------------------------------------------------ - - Array of Structs Sort - - buffer *buf; - typedef struct { ... } T; - T A[n]; - - sarray_sort(T,A,n, field_name,is_long, buf) - - sort A according to the struct field "field_name", - which is a ulong/uint field according as is_long is true/false - - sarray_sort_2(T,A,n, field1,is_long1, field2,is_long2, buf) - - sort A by field1 then field2 - - sarray_permute(T,A,n, perm, work) - - permute A (in-place) - A[0] <- A[perm[0]], etc. - work needs to hold sizeof(T) bytes (i.e., 1 T) - - sarray_permute_buf(T,A,n, buf); - - permute A according to the permutation in buf - A[0] <- A[perm[0]], etc. 
- where uint *perm = buf->ptr (see "sort.h") - - ----------------------------------------------------------------------------*/ - - -#define sarray_permute_ PREFIXED_NAME(sarray_permute_) -#define sarray_permute_buf_ PREFIXED_NAME(sarray_permute_buf_) - -void sarray_permute_(size_t size, void *A, size_t n, uint *perm, void *work); -void sarray_permute_buf_( - size_t align, size_t size, void *A, size_t n, buffer *buf); - -#define sarray_permute(T,A,n, perm, work) \ - sarray_permute_(sizeof(T),A,n, perm, work) -#define sarray_permute_buf(T,A,n, buf) \ - sarray_permute_buf_(ALIGNOF(T),sizeof(T),A,n,buf) - -#define sarray_sort_field(T,A,n, field,is_long, buf,keep) do { \ - if(is_long) \ - sortp_long(buf,keep, (ulong*)((char*)(A)+offsetof(T,field)),n,sizeof(T)); \ - else \ - sortp (buf,keep, (uint *)((char*)(A)+offsetof(T,field)),n,sizeof(T)); \ -} while (0) - -#define sarray_sort(T,A,n, field,is_long, buf) do { \ - sarray_sort_field(T,A,n, field,is_long, buf,0); \ - sarray_permute_buf(T,A,n, buf); \ -} while (0) - -#define sarray_sort_2(T,A,n, field1,is_long1, field2,is_long2, buf) do { \ - sarray_sort_field(T,A,n, field2,is_long2, buf,0); \ - sarray_sort_field(T,A,n, field1,is_long1, buf,1); \ - sarray_permute_buf(T,A,n, buf); \ -} while (0) - -#define sarray_sort_3(T,A,n, field1,is_long1, field2,is_long2, \ - field3,is_long3, buf) do { \ - sarray_sort_field(T,A,n, field3,is_long3, buf,0); \ - sarray_sort_field(T,A,n, field2,is_long2, buf,1); \ - sarray_sort_field(T,A,n, field1,is_long1, buf,1); \ - sarray_permute_buf(T,A,n, buf); \ -} while (0) - -#define sarray_sort_4(T,A,n, field1,is_long1, field2,is_long2, \ - field3,is_long3, field4,is_long4, buf) do { \ - sarray_sort_field(T,A,n, field4,is_long4, buf,0); \ - sarray_sort_field(T,A,n, field3,is_long3, buf,1); \ - sarray_sort_field(T,A,n, field2,is_long2, buf,1); \ - sarray_sort_field(T,A,n, field1,is_long1, buf,1); \ - sarray_permute_buf(T,A,n, buf); \ -} while (0) - -static void sarray_perm_invert( - uint *const 
pinv, const uint *const perm, const uint n) -{ - uint i; for(i=0;i -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "comm.h" -#include "mem.h" -#include "crystal.h" -#include "sort.h" - -#define sarray_transfer_many PREFIXED_NAME(sarray_transfer_many) -#define sarray_transfer_ PREFIXED_NAME(sarray_transfer_ ) -#define sarray_transfer_ext_ PREFIXED_NAME(sarray_transfer_ext_) - -static void pack_int( - buffer *const data, const unsigned row_size, const uint id, - const char *const restrict input, const uint n, const unsigned size, - const unsigned p_off, const uint *const restrict perm) -{ - const unsigned after = p_off + sizeof(uint), after_len = size-after; - -#define GET_P() memcpy(&p,row+p_off,sizeof(uint)) -#define COPY_ROW() memcpy(out,row,p_off), \ - memcpy((char*)out + p_off,row+after,after_len) - -#define PACK_BODY() do { \ - uint dummy, *len_ptr=&dummy; \ - uint i, p,lp = UINT_MAX, len=0; \ - uint *restrict out = buffer_reserve(data, n*(row_size+3)*sizeof(uint)); \ - for(i=0;in = out - (uint*)data->ptr; \ -} while(0) - PACK_BODY(); -#undef COPY_ROW -#undef GET_P -} - -static void pack_ext( - buffer *const data, const unsigned row_size, const uint id, - const char *const restrict input, const uint n, const unsigned size, - const uint *const restrict proc, const unsigned proc_stride, - const uint *const restrict perm) -{ - #define GET_P() p=*(const uint*)((const char*)proc+proc_stride*perm[i]) - #define COPY_ROW() memcpy(out,row,size) - PACK_BODY(); - #undef PACK_BODY - #undef COPY_ROW - #undef GET_P -} - -static void pack_more( - buffer *const data, const unsigned off, const unsigned row_size, - const char *const restrict input, const unsigned size, - const uint *restrict perm) -{ - uint *restrict buf = data->ptr, *buf_end = buf+data->n; - while(buf!=buf_end) { - uint *msg_end = buf+3+buf[2]; buf+=3; - while(buf!=msg_end) - memcpy((char*)buf+off, input+size*(*perm++), size), buf+=row_size; - 
} -} - -static void unpack_more( - char *restrict out, const unsigned size, - const buffer *const data, const unsigned off, const unsigned row_size) -{ - const uint *restrict buf = data->ptr, *buf_end = buf+data->n; - while(buf!=buf_end) { - const uint *msg_end = buf+3+buf[2]; buf+=3; - while(buf!=msg_end) - memcpy(out, (char*)buf+off, size), out+=size, buf+=row_size; - } -} - -static void unpack_int( - char *restrict out, const unsigned size, const unsigned p_off, - const buffer *const data, const unsigned row_size, int set_src) -{ - const unsigned after = p_off + sizeof(uint), after_len = size-after; - const uint *restrict buf = data->ptr, *buf_end = buf+data->n; - const unsigned pi = set_src ? 1:0; - while(buf!=buf_end) { - const uint p=buf[pi], *msg_end = buf+3+buf[2]; buf+=3; - while(buf!=msg_end) { - memcpy(out,buf,p_off); - memcpy(out+p_off,&p,sizeof(uint)); - memcpy(out+after,(const char *)buf+p_off,after_len); - out+=size, buf+=row_size; - } - } -} - -static uint num_rows(const buffer *const data, const unsigned row_size) -{ - const uint *buf = data->ptr, *buf_end = buf + data->n; - uint n=0; - while(buf!=buf_end) { uint len=buf[2]; n+=len, buf+=len+3; } - return n/row_size; -} - -static uint cap_rows(buffer *const data, const unsigned row_size,const uint max) -{ - uint *buf = data->ptr, *buf_end = buf + data->n; - const uint maxn = max*row_size; - uint n=0; - while(buf!=buf_end) { - uint len=buf[2]; n+=len; - if(nn = (buf-(uint*)data->ptr)+3+buf[2]; - buf+=len+3; - while(buf!=buf_end) { uint llen=buf[2]; n+=llen, buf+=llen+3; } - break; - } - } - return n/row_size; -} - -/* An must be >= 1 */ -uint sarray_transfer_many( - struct array *const *const A, const unsigned *const size, const unsigned An, - const int fixed, const int ext, const int set_src, const unsigned p_off, - const uint *const restrict proc, const unsigned proc_stride, - struct crystal *const cr) -{ - uint n, *perm; - unsigned i,row_size,off,off1; - - off1 = size[0]; - if(!ext) off1 -= 
sizeof(uint); - row_size=off1; for(i=1;iwork,0, proc,A[0]->n,proc_stride); - - if(!ext) pack_int(&cr->data, row_size, cr->comm.id, A[0]->ptr,A[0]->n,size[0], - p_off, perm); - else pack_ext(&cr->data, row_size, cr->comm.id, A[0]->ptr,A[0]->n,size[0], - proc,proc_stride, perm); - for(off=off1,i=1;idata,off,row_size, A[i]->ptr,size[i], perm),off+=size[i]; - - crystal_router(cr); - - if(!fixed) { - n = num_rows(&cr->data,row_size); - for(i=0;in=n; - } else { - uint max=A[0]->max, an; - for(i=1;imaxmax; - n = cap_rows(&cr->data,row_size, max); - an = n>max?max:n; - for(i=0;in=an; - } - - if(!ext) unpack_int (A[0]->ptr,size[0],p_off, &cr->data, row_size, set_src); - else unpack_more(A[0]->ptr,size[0], &cr->data,0,row_size); - for(off=off1,i=1;iptr,size[i], &cr->data,off,row_size),off+=size[i]; - - return n; -} - - -void sarray_transfer_(struct array *const A, const unsigned size, - const unsigned p_off, const int set_src, - struct crystal *const cr) -{ - sarray_transfer_many(&A,&size,1, 0,0,set_src,p_off, - (uint*)((char*)A->ptr+p_off),size, cr); -} - -void sarray_transfer_ext_(struct array *const A, const unsigned size, - const uint *const proc, const unsigned proc_stride, - struct crystal *const cr) -{ - sarray_transfer_many(&A,&size,1, 0,1,0,0, proc,proc_stride, cr); -} - diff --git a/3rdParty/gslib/src/sarray_transfer.h b/3rdParty/gslib/src/sarray_transfer.h deleted file mode 100644 index c195e2174..000000000 --- a/3rdParty/gslib/src/sarray_transfer.h +++ /dev/null @@ -1,95 +0,0 @@ -#ifndef SARRAY_TRANSFER_H -#define SARRAY_TRANSFER_H - -#if !defined(CRYSTAL_H) -#warning "sarray_transfer.h" requires "crystal.h" -#endif - -/* - High-level interface for the crystal router. - Given an array of structs, transfers each to the process indicated - by a field of the struct, which gets set to the source process on output. - - For the dynamic "array" type, see "mem.h". 
- - Requires a "crystal router" object: - - struct comm c; - struct crystal cr; - - comm_init(&c, MPI_COMM_WORLD); - crystal_init(&cr, &c); - - Example sarray_transfer usage: - - struct T { ...; uint proc; ...; }; - struct array A = null_array; - struct T *p, *e; - - // resize A to 100 struct T's, fill up with data - p = array_reserve(struct T, &A, 100), A.n=100; - for(e=p+A.n;p!=e;++p) { - ... - p->proc = ...; - ... - } - - // array A represents the array - // struct T ar[A.n] where &ar[0] == A.ptr - // transfer ar[i] to processor ar[i].proc for each i=0,...,A.n-1: - - sarray_transfer(struct T, A, proc,set_src, &cr); - - // now array A represents a different array with a different size - // struct T ar[A.n] where &ar[0] == A.ptr - // the ordering is arbitrary - // if set_src != 0, ar[i].proc is set to the proc where ar[i] came from - // otherwise ar[i].proc is unchanged (and == this proc id) - - // note: two calls of - sarray_transfer(struct T, A, proc,1, &cr); - // in a row should return A to its original state, up to ordering - - Cleanup: - array_free(&A); - crystal_free(&cr); - comm_free(&c); - - Example sarray_transfer_ext usage: - - struct T { ... 
}; - struct array A; - uint proc[A.n]; - - // array A represents the array - // struct T ar[A.n] where &ar[0] == A.ptr - // transfer ar[i] to processor proc[i] for each i=0,...,A.n-1: - sarray_transfer_ext(struct T, &A, proc, &cr); - - // no information is available now on where each struct came from - -*/ - -#define sarray_transfer_many PREFIXED_NAME(sarray_transfer_many) -#define sarray_transfer_ PREFIXED_NAME(sarray_transfer_ ) -#define sarray_transfer_ext_ PREFIXED_NAME(sarray_transfer_ext_) - -uint sarray_transfer_many( - struct array *const *const A, const unsigned *const size, const unsigned An, - const int fixed, const int ext, const int set_src, const unsigned p_off, - const uint *const restrict proc, const unsigned proc_stride, - struct crystal *const cr); -void sarray_transfer_(struct array *const A, const unsigned size, - const unsigned p_off, const int set_src, - struct crystal *const cr); -void sarray_transfer_ext_(struct array *const A, const unsigned size, - const uint *const proc, const unsigned proc_stride, - struct crystal *const cr); - -#define sarray_transfer(T,A,proc_field,set_src,cr) \ - sarray_transfer_(A,sizeof(T),offsetof(T,proc_field),set_src,cr) - -#define sarray_transfer_ext(T,A,proc,proc_stride,cr) \ - sarray_transfer_ext_(A,sizeof(T),proc,proc_stride,cr) - -#endif diff --git a/3rdParty/gslib/src/sort.c b/3rdParty/gslib/src/sort.c deleted file mode 100644 index 5b25f429f..000000000 --- a/3rdParty/gslib/src/sort.c +++ /dev/null @@ -1,31 +0,0 @@ -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "mem.h" - -#define T unsigned int -#define SORT_SUFFIX _ui -#include "sort_imp.h" -#undef SORT_SUFFIX -#undef T - -#if defined(USE_LONG) || defined(GLOBAL_LONG) -# define T unsigned long -# define SORT_SUFFIX _ul -# include "sort_imp.h" -# undef SORT_SUFFIX -# undef T -#endif - -#if defined(USE_LONG_LONG) || defined(GLOBAL_LONG_LONG) -# define T unsigned long long -# 
define SORT_SUFFIX _ull -# include "sort_imp.h" -# undef SORT_SUFFIX -# undef T -#endif diff --git a/3rdParty/gslib/src/sort.h b/3rdParty/gslib/src/sort.h deleted file mode 100644 index eaeeb957a..000000000 --- a/3rdParty/gslib/src/sort.h +++ /dev/null @@ -1,76 +0,0 @@ -#ifndef SORT_H -#define SORT_H - -#if !defined(TYPES_H) || !defined(MEM_H) -#warning "sort.h" requires "types.h" and "mem.h" -/* types.h defines uint, ulong - mem.h defines buffer */ -#endif - -/*------------------------------------------------------------------------------ - - Sort - - O(n) stable sort with good performance for all n - - sortv (uint *out, const uint *A, uint n, uint stride, buffer *buf) - sortv_long(ulong *out, const ulong *A, uint n, uint stride, buffer *buf) - - sortp (buffer *buf, int perm_start, const uint *A, uint n, uint stride) - sortp_long(buffer *buf, int perm_start, const ulong *A, uint n, uint stride) - - A, n, stride : specifices the input (stride is in bytes!) - out : the sorted values on output - - For the value sort, (sortv*) - A and out may alias (A == out) exactly when stride == sizeof(T) - - For the permutation sort, (sortp*) - The permutation can be both input (when start_perm!=0) and output, - following the convention that it is always at the start of the buffer buf: - uint *perm = buf->ptr; - The permutation denotes the ordering - A[perm[0]], A[perm[1]], ..., A[perm[n-1]] - (assuming stride == sizeof(uint) or sizeof(ulong) as appropriate) - and is re-arranged stably to give a sorted ordering. - Specifying start_perm==0 is equivalent to specifying - perm[i] = i, i=0,...,n-1 - for an initial permutation (but may be faster). - The buffer will be expanded as necessary to accomodate the permutation - and the required scratch space. - - Most code calls these routines indirectly via the higher-level routine - sarray_sort for sorting arrays of structs (see "sarray_sort.h"). 
- - ----------------------------------------------------------------------------*/ - -#define sortv_ui PREFIXED_NAME(sortv_ui) -#define sortv_ul PREFIXED_NAME(sortv_ul) -#define sortv_ull PREFIXED_NAME(sortv_ull) -#define sortp_ui PREFIXED_NAME(sortp_ui) -#define sortp_ul PREFIXED_NAME(sortp_ul) -#define sortp_ull PREFIXED_NAME(sortp_ull) - -#define sortv TYPE_LOCAL(sortv_ui,sortv_ul,sortv_ull) -#define sortp TYPE_LOCAL(sortp_ui,sortp_ul,sortp_ull) -#define sortv_long TYPE_GLOBAL(sortv_ui,sortv_ul,sortv_ull) -#define sortp_long TYPE_GLOBAL(sortp_ui,sortp_ul,sortp_ull) - -void sortv_ui(unsigned *out, const unsigned *A, uint n, unsigned stride, - buffer *restrict buf); -void sortv_ul(unsigned long *out, - const unsigned long *A, uint n, unsigned stride, - buffer *restrict buf); -uint *sortp_ui(buffer *restrict buf, int start_perm, - const unsigned *restrict A, uint n, unsigned stride); -uint *sortp_ul(buffer *restrict buf, int start_perm, - const unsigned long *restrict A, uint n, unsigned stride); -#if defined(USE_LONG_LONG) || defined(GLOBAL_LONG_LONG) -void sortv_ull(unsigned long long *out, - const unsigned long long *A, uint n, unsigned stride, - buffer *restrict buf); -uint *sortp_ull(buffer *restrict buf, int start_perm, - const unsigned long long *restrict A, uint n, unsigned stride); -#endif - -#endif diff --git a/3rdParty/gslib/src/sort_imp.h b/3rdParty/gslib/src/sort_imp.h deleted file mode 100644 index e772f91aa..000000000 --- a/3rdParty/gslib/src/sort_imp.h +++ /dev/null @@ -1,544 +0,0 @@ -#if !defined(T) || !defined(SORT_SUFFIX) -#error sort_imp.h not meant to be compiled by itself -#endif - -#define sort_data TOKEN_PASTE(sort_data ,SORT_SUFFIX) -#define radix_count TOKEN_PASTE(radix_count ,SORT_SUFFIX) -#define radix_offsets TOKEN_PASTE(radix_offsets ,SORT_SUFFIX) -#define radix_zeros TOKEN_PASTE(radix_zeros ,SORT_SUFFIX) -#define radix_passv TOKEN_PASTE(radix_passv ,SORT_SUFFIX) -#define radix_sortv TOKEN_PASTE(radix_sortv ,SORT_SUFFIX) -#define 
radix_passp0_b TOKEN_PASTE(radix_passp0_b ,SORT_SUFFIX) -#define radix_passp_b TOKEN_PASTE(radix_passp_b ,SORT_SUFFIX) -#define radix_passp_m TOKEN_PASTE(radix_passp_m ,SORT_SUFFIX) -#define radix_passp_e TOKEN_PASTE(radix_passp_e ,SORT_SUFFIX) -#define radix_passp0_be TOKEN_PASTE(radix_passp0_be,SORT_SUFFIX) -#define radix_passp_be TOKEN_PASTE(radix_passp_be, SORT_SUFFIX) -#define radix_sortp TOKEN_PASTE(radix_sortp ,SORT_SUFFIX) -#define merge_sortv TOKEN_PASTE(merge_sortv ,SORT_SUFFIX) -#define merge_copy_perm TOKEN_PASTE(merge_copy_perm,SORT_SUFFIX) -#define merge_sortp0 TOKEN_PASTE(merge_sortp0 ,SORT_SUFFIX) -#define merge_sortp TOKEN_PASTE(merge_sortp ,SORT_SUFFIX) -#define heap_sortv TOKEN_PASTE(heap_sortv ,SORT_SUFFIX) - -#define sortv PREFIXED_NAME(TOKEN_PASTE(sortv,SORT_SUFFIX)) -#define sortp PREFIXED_NAME(TOKEN_PASTE(sortp,SORT_SUFFIX)) - -typedef struct { T v; uint i; } sort_data; - -#define INC_PTR(A,stride) ((A)=(T*)((char*)(A)+(stride))) -#define INDEX_PTR(A,stride,i) (*(T*)((char*)(A)+(i)*(stride))) - -/*------------------------------------------------------------------------------ - - Radix Sort - - stable; O(n+k) time and extra storage - where k = (digits in an int) * 2^(bits per digit) - (e.g. 
k = 4 * 256 = 1024 for 32-bit ints with 8-bit digits) - - brief description: - input sorted stably on each digit, starting with the least significant - counting sort is used for each digit: - a pass through the input counts the occurences of each digit value - on a second pass, each input has a known destination - - tricks: - all counting passes are combined into one - the counting pass also computes the inclusive bit-wise or of all inputs, - which is used to skip digit positions for which all inputs have zeros - - ----------------------------------------------------------------------------*/ - -#define STATIC_DIGIT_BUCKETS 1 - -#define DIGIT_BITS 8 -#define DIGIT_VALUES (1<i) count[i][val&DIGIT_MASK]++, val>>=DIGIT_BITS -#define COUNT_DIGIT_02(n,i) COUNT_DIGIT_01(n,i); COUNT_DIGIT_01(n,i+ 1) -#define COUNT_DIGIT_04(n,i) COUNT_DIGIT_02(n,i); COUNT_DIGIT_02(n,i+ 2) -#define COUNT_DIGIT_08(n,i) COUNT_DIGIT_04(n,i); COUNT_DIGIT_04(n,i+ 4) -#define COUNT_DIGIT_16(n,i) COUNT_DIGIT_08(n,i); COUNT_DIGIT_08(n,i+ 8) -#define COUNT_DIGIT_32(n,i) COUNT_DIGIT_16(n,i); COUNT_DIGIT_16(n,i+16) -#define COUNT_DIGIT_64(n,i) COUNT_DIGIT_32(n,i); COUNT_DIGIT_32(n,i+32) - -static T radix_count( - uint (*restrict count)[DIGIT_VALUES], - const T *restrict A, const T *const end, const unsigned stride) -{ - T bitorkey = 0; - memset(count,0,COUNT_SIZE*sizeof(uint)); - do { - T val=*A; - bitorkey|=val; - COUNT_DIGIT_64(DIGITS,0); - /* above macro expands to: - if(DIGITS> 0) count[ 0][val&DIGIT_MASK]++, val>>=DIGIT_BITS; - if(DIGITS> 1) count[ 1][val&DIGIT_MASK]++, val>>=DIGIT_BITS; - ... 
- if(DIGITS>63) count[63][val&DIGIT_MASK]++, val>>=DIGIT_BITS; - */ - } while(INC_PTR(A,stride),A!=end); - return bitorkey; -} - -#undef COUNT_DIGIT_01 -#undef COUNT_DIGIT_02 -#undef COUNT_DIGIT_04 -#undef COUNT_DIGIT_08 -#undef COUNT_DIGIT_16 -#undef COUNT_DIGIT_32 -#undef COUNT_DIGIT_64 - -static void radix_offsets(uint *restrict c) -{ - uint *const ce = c+DIGIT_VALUES; - uint sum = 0; - do { - const uint c0=c[0], c1=c[1], c2=c[2], c3=c[3]; - const uint o1=sum+c0, o2=o1+c1, o3=o2+c2; - c[0]=sum, c[1]=o1, c[2]=o2, c[3]=o3; - sum = o3+c3; - c+=4; - } while(c!=ce); -} - -static unsigned radix_zeros( - T bitorkey, uint (*restrict count)[DIGIT_VALUES], - unsigned *restrict shift, uint **restrict offsets) -{ - unsigned digits=0, sh=0; uint *c = &count[0][0]; - do { - if(bitorkey&DIGIT_MASK) *shift++ = sh, *offsets++ = c, ++digits, - radix_offsets(c); - } while(bitorkey>>=DIGIT_BITS,sh+=DIGIT_BITS,c+=DIGIT_VALUES,sh!=VALUE_BITS); - return digits; -} - -static void radix_passv( - const T *restrict A, const T *const end, const unsigned stride, - const unsigned sh, uint *const restrict off, T *const restrict out) -{ - do out[off[(*A>>sh)&DIGIT_MASK]++] = *A; while(INC_PTR(A,stride),A!=end); -} - -static void radix_sortv( - T *out, const T *A, const uint n, const unsigned stride, - T *work, uint (*restrict count)[DIGIT_VALUES]) -{ - const T *const end = &INDEX_PTR(A,stride,n); - T bitorkey = radix_count(count, A,end,stride); - unsigned shift[DIGITS]; uint *offsets[DIGITS]; - const unsigned digits = radix_zeros(bitorkey,count,shift,offsets); - if(digits==0) { - memset(out,0,n*sizeof(T)); - } else { - T *src, *dst; unsigned d; - if(out==A || (digits&1)==0) dst=out,src=work; - else src=out,dst=work; - radix_passv(A,end,stride,shift[0],offsets[0],src); - for(d=1;d!=digits;++d) { - T *t; - radix_passv(src,src+n,sizeof(T),shift[d],offsets[d],dst); - t=src,src=dst,dst=t; - } - if(src!=out) memcpy(out,src,n*sizeof(T)); - } -} - -static void radix_passp0_b( - const T *restrict A, 
const uint n, const unsigned stride, - const unsigned sh, uint *const restrict off, - sort_data *const restrict out) -{ - uint i=0; - do { - T v = *A; - sort_data *d = &out[off[(v>>sh)&DIGIT_MASK]++]; - d->v=v, d->i=i++; - } while(INC_PTR(A,stride),i!=n); -} - -static void radix_passp_b( - const uint *restrict p, - const T *const restrict A, const uint n, const unsigned stride, - const unsigned sh, uint *const restrict off, - sort_data *const out) -{ - const uint *const pe = p+n; - do { - uint j = *p++; - T v = INDEX_PTR(A,stride,j); - sort_data *d = &out[off[(v>>sh)&DIGIT_MASK]++]; - d->v=v, d->i=j; - } while(p!=pe); -} - -static void radix_passp_m( - const sort_data *restrict src, const sort_data *const end, - const unsigned sh, uint *const restrict off, - sort_data *const restrict out) -{ - do { - sort_data *d = &out[off[(src->v>>sh)&DIGIT_MASK]++]; - d->v=src->v,d->i=src->i; - } while(++src!=end); -} - -static void radix_passp_e( - const sort_data *restrict src, const sort_data *const end, - const unsigned sh, uint *const restrict off, - uint *const restrict out) -{ - do out[off[(src->v>>sh)&DIGIT_MASK]++]=src->i; while(++src!=end); -} - -static void radix_passp0_be( - uint *const restrict out, - const T *restrict A, const uint n, const unsigned stride, - const unsigned sh, uint *const restrict off) -{ - uint i=0; - do out[off[(*A>>sh)&DIGIT_MASK]++]=i++; while(INC_PTR(A,stride),i!=n); -} - -static void radix_passp_be( - uint *restrict p, - const T *restrict A, const uint n, const unsigned stride, - const unsigned sh, uint *const restrict off, - sort_data *restrict work) -{ - uint *q = p, *const qe = p+n; - uint *w = &work[0].i; - do { - uint j = *q++; - T v = INDEX_PTR(A,stride,j); - w[off[(v>>sh)&DIGIT_MASK]++]=j; - } while(q!=qe); - memcpy(p,w,n*sizeof(uint)); -} - -static void radix_sortp( - uint *restrict idx, uint perm_start, - const T *restrict A, const uint n, const unsigned stride, - sort_data *restrict work, - uint (*restrict count)[DIGIT_VALUES]) -{ 
- T bitorkey = radix_count(count, A,&INDEX_PTR(A,stride,n),stride); - unsigned shift[DIGITS]; uint *offsets[DIGITS]; - unsigned digits = radix_zeros(bitorkey,count,shift,offsets); - if(digits==0) { - if(!perm_start) { uint i=0; do *idx++=i++; while(i!=n); } - } else if(digits==1) { - if(perm_start) radix_passp_be (idx,A,n,stride,shift[0],offsets[0],work); - else radix_passp0_be(idx,A,n,stride,shift[0],offsets[0]); - } else { - sort_data *src, *dst; unsigned d; - if((digits&1)==0) dst=work,src=dst+n; - else src=work,dst=src+n; - if(perm_start) radix_passp_b (idx,A,n,stride,shift[0],offsets[0],src); - else radix_passp0_b( A,n,stride,shift[0],offsets[0],src); - for(d=1;d!=digits-1;++d) { - sort_data *t; - radix_passp_m(src,src+n,shift[d],offsets[d],dst); - t=src,src=dst,dst=t; - } - radix_passp_e(src,src+n,shift[d],offsets[d],idx); - } -} - -/*------------------------------------------------------------------------------ - - Merge Sort - - stable; O(n log n) time - - ----------------------------------------------------------------------------*/ - -#define MERGE_2(p,v) \ - if(VAL(v[1])3) odd<<=1,odd|=(n&1),n>>=1,c<<=1,b^=1; \ - } else \ - base-=n-(odd&1),n<<=1,n-=(odd&1),odd>>=1,c>>=1; \ - if(c==0) break; \ - p = buf[b]+base; \ - if(n==2) { \ - DATA v[2]; SETVAL(v[0],i), SETVAL(v[1],i+1); \ - MERGE_2(p,v); \ - i+=2; \ - } else if(n==3) { \ - DATA v[3]; SETVAL(v[0],i), SETVAL(v[1],i+1), SETVAL(v[2],i+2); \ - MERGE_3(p,v); \ - i+=3; \ - } else { \ - const uint na = n>>1, nb = (n+1)>>1; \ - const DATA *restrict ap = buf[b^1]+base, *const ae = ap+na; \ - DATA *restrict bp = p+na, *const be = bp+nb; \ - for(;;) { \ - if(VAL((*bp))i; while(p!=pe);*/ - uint n_by_8 = (n+7)/8; - switch(n%8) { - case 0: do { *idx++ = (p++)->i; - case 7: *idx++ = (p++)->i; - case 6: *idx++ = (p++)->i; - case 5: *idx++ = (p++)->i; - case 4: *idx++ = (p++)->i; - case 3: *idx++ = (p++)->i; - case 2: *idx++ = (p++)->i; - case 1: *idx++ = (p++)->i; - } while (--n_by_8 > 0); - } -} - -static void 
merge_sortp0( - uint *restrict idx, - const T *restrict A, const uint An, const unsigned stride, - sort_data *restrict work) -{ - sort_data *buf[2]; buf[0]=work+An,buf[1]=work; -#define DATA sort_data -#define VAL(x) x.v -#define SETVAL(x,ai) x.v=*A,INC_PTR(A,stride),x.i=ai - MERGE_SORT(); -#undef SETVAL -#undef VAL -#undef DATA - merge_copy_perm(idx,buf[0],An); -} - -static void merge_sortp( - uint *restrict idx, - const T *const restrict A, const uint An, const unsigned stride, - sort_data *restrict work) -{ - sort_data *buf[2]; buf[0]=work+An,buf[1]=work; -#define DATA sort_data -#define VAL(x) x.v -#define SETVAL(x,ai) x.i=idx[ai],x.v=INDEX_PTR(A,stride,x.i) - MERGE_SORT(); -#undef SETVAL -#undef VAL -#undef DATA - merge_copy_perm(idx,buf[0],An); -} - -#undef MERGE_SORT -#undef MERGE_3 -#undef MERGE_2 - -/*------------------------------------------------------------------------------ - - Heap Sort - - in-place, stability unobservable; O(n log n) time - - ----------------------------------------------------------------------------*/ -static void heap_sortv(T *const restrict A, unsigned n) -{ - unsigned i; - /* build heap */ - for(i=1;i>1; - if(A[p] >= item) continue; - do A[h]=A[p], h=p, p=(p-1)>>1; while(h && A[p] < item); - A[h] = item; - } - /* extract */ - for(i=n-1;i;--i) { - T item = A[i]; - unsigned h = 0; - A[i] = A[0]; - for(;;) { - unsigned ch = 1+(h<<1), r = ch+1; - if(r=i || item >= A[ch]) break; - A[h]=A[ch], h=ch; - } - A[h] = item; - } -} - - -/*------------------------------------------------------------------------------ - - Hybrid Stable Sort - - low-overhead merge sort when n is small, - otherwise asymptotically superior radix sort - - result = O(n) sort with good performance for all n - - A, n, stride : specifices the input, stride in bytes - out : the sorted values on output - - For the value sort, - A and out may alias (A == out) exactly when stride == sizeof(T), - in which case heap sort is used for small sizes - - For the permutation 
sort, - the permutation can be both input (when start_perm!=0) and output, - following the convention that it is always at the start of the buffer buf; - the buffer will be expanded as necessary to accomodate the permutation - and the required scratch space - - ----------------------------------------------------------------------------*/ - -void sortv(T *out, const T *A, uint n, unsigned stride, buffer *restrict buf) -{ - if(nptr); - } - } - } else if(STATIC_DIGIT_BUCKETS) { - static uint count[DIGITS][DIGIT_VALUES]; - buffer_reserve(buf,n*sizeof(T)); - radix_sortv(out, A,n,stride, (T*)buf->ptr,count); - } else { - T *restrict work; - uint (*restrict count)[DIGIT_VALUES]; - const size_t count_off=align_as(uint,n*sizeof(T)); - buffer_reserve(buf,count_off+sizeof(uint[DIGITS][DIGIT_VALUES])); - work = buf->ptr; - count = (uint(*)[DIGIT_VALUES])((char*)buf->ptr+count_off); - radix_sortv(out, A,n,stride, work,count); - } -} - -uint *sortp(buffer *restrict buf, int start_perm, - const T *restrict A, uint n, unsigned stride) -{ - uint *restrict perm; - sort_data *restrict work; - size_t work_off=align_as(sort_data,n*sizeof(uint)); - if(nptr; - work = (sort_data*)((char*)buf->ptr+work_off); - if(n<2) { - if(n==1) *perm=0; - } else { - if(start_perm) merge_sortp (perm, A,n,stride, work); - else merge_sortp0(perm, A,n,stride, work); - } - } else if(STATIC_DIGIT_BUCKETS){ - static uint count[DIGITS][DIGIT_VALUES]; - buffer_reserve(buf,work_off+2*n*sizeof(sort_data)); - perm = buf->ptr; - work = (sort_data*)((char*)buf->ptr+work_off); - radix_sortp(perm,start_perm, A,n,stride, work,count); - } else { - uint (*restrict count)[DIGIT_VALUES]; - const size_t count_off=align_as(uint,work_off+2*n*sizeof(sort_data)); - buffer_reserve(buf,count_off+sizeof(uint[DIGITS][DIGIT_VALUES])); - perm = buf->ptr; - work = (sort_data*)((char*)buf->ptr+work_off); - count = (uint(*)[DIGIT_VALUES])((char*)buf->ptr+count_off); - radix_sortp(perm,start_perm, A,n,stride, work,count); - } - return 
perm; -} - -#undef STATIC_DIGIT_BUCKETS - -#undef DIGIT_BITS -#undef DIGIT_VALUES -#undef DIGIT_MASK -#undef CEILDIV -#undef DIGITS -#undef VALUE_BITS -#undef COUNT_SIZE - -#undef INDEX_PTR -#undef INC_PTR - -#undef sortp -#undef sortv - -#undef merge_sortp -#undef merge_sortp0 -#undef merge_sortv -#undef radix_sortp -#undef radix_passp_be -#undef radix_passp0_be -#undef radix_passp_e -#undef radix_passp_m -#undef radix_passp_b -#undef radix_passp0_b -#undef radix_sortv -#undef radix_passv -#undef radix_zeros -#undef radix_offsets -#undef radix_count -#undef sort_data - diff --git a/3rdParty/gslib/src/tensor.c b/3rdParty/gslib/src/tensor.c deleted file mode 100644 index a72471418..000000000 --- a/3rdParty/gslib/src/tensor.c +++ /dev/null @@ -1,82 +0,0 @@ -#include "c99.h" -#include "name.h" -#include "types.h" - -#if !defined(USE_CBLAS) - -#define tensor_dot PREFIXED_NAME(tensor_dot ) -#define tensor_mtxm PREFIXED_NAME(tensor_mtxm) - -/* Matrices are always column-major (FORTRAN style) */ - -double tensor_dot(const double *a, const double *b, uint n) -{ - double sum = 0; - for(;n;--n) sum += *a++ * *b++; - return sum; -} - -# if defined(USE_NAIVE_BLAS) -# define tensor_mxv PREFIXED_NAME(tensor_mxv ) -# define tensor_mtxv PREFIXED_NAME(tensor_mtxv) -# define tensor_mxm PREFIXED_NAME(tensor_mxm ) - -/* y = A x */ -void tensor_mxv( - double *restrict y, uint ny, - const double *restrict A, const double *restrict x, uint nx) -{ - uint i; - for(i=0;i -# define tensor_dot(a,b,n) cblas_ddot((int)(n),a,1,b,1) -# define tensor_mxv(y,ny,A,x,nx) \ - cblas_dgemv(CblasColMajor,CblasNoTrans,(int)ny,(int)nx, \ - 1.0,A,(int)ny,x,1,0.0,y,1) -# define tensor_mtxv(y,ny,A,x,nx) \ - cblas_dgemv(CblasColMajor,CblasTrans,(int)nx,(int)ny, \ - 1.0,A,(int)nx,x,1,0.0,y,1) -# define tensor_mxm(C,nc,A,na,B,nb) \ - cblas_dgemm(CblasColMajor,CblasNoTrans,CblasNoTrans, \ - (int)nc,(int)nb,(int)na,1.0, \ - A,(int)nc,B,(int)na,0.0,C,(int)nc) -# define tensor_mtxm(C,nc,A,na,B,nb) \ - 
cblas_dgemm(CblasColMajor,CblasTrans,CblasNoTrans, \ - (int)nc,(int)nb,(int)na,1.0, \ - A,(int)na,B,(int)na,0.0,C,(int)nc) -#else -# define tensor_dot PREFIXED_NAME(tensor_dot ) -# define tensor_mtxm PREFIXED_NAME(tensor_mtxm) -double tensor_dot(const double *a, const double *b, uint n); - -/* C (nc x nb) = [A (na x nc)]^T * B (na x nb); all column-major */ -void tensor_mtxm(double *C, uint nc, - const double *A, uint na, const double *B, uint nb); -# if defined(USE_NAIVE_BLAS) -# define tensor_mxv PREFIXED_NAME(tensor_mxv ) -# define tensor_mtxv PREFIXED_NAME(tensor_mtxv) -# define tensor_mxm PREFIXED_NAME(tensor_mxm ) -/* y = A x */ -void tensor_mxv(double *y, uint ny, const double *A, const double *x, uint nx); - -/* y = A^T x */ -void tensor_mtxv(double *y, uint ny, const double *A, const double *x, uint nx); - -/* C (nc x nb) = A (nc x na) * B (na x nb); all column-major */ -void tensor_mxm(double *C, uint nc, - const double *A, uint na, const double *B, uint nb); -# else -# define mxm FORTRAN_NAME(mxm,MXM) -/* C (na x nc) = A (na x nb) * B (nb x nc); all column-major */ -void mxm(const double *A, const uint *na, - const double *B, const uint *nb, - double *C, const uint *nc); -/* C (nc x nb) = A (nc x na) * B (na x nb); all column-major */ -static void tensor_mxm(double *C, uint nc, - const double *A, uint na, const double *B, uint nb) -{ mxm(A,&nc,B,&na,C,&nb); } - -/* y = A x */ -static void tensor_mxv(double *y, uint ny, - const double *A, const double *x, uint nx) -{ uint one=1; mxm(A,&ny,x,&nx,y,&one); } - -/* y = A^T x */ -static void tensor_mtxv(double *y, uint ny, - const double *A, const double *x, uint nx) -{ uint one=1; mxm(x,&one,A,&nx,y,&ny); } - -# endif -#endif - -/*-------------------------------------------------------------------------- - 1-,2-,3-d Tensor Application of Row Vectors (for Interpolation) - - the 3d case: - v = tensor_i3(Jr,nr, Js,ns, Jt,nt, u, work) - gives v = [ Jr (x) Js (x) Jt ] u - where Jr, Js, Jt are row vectors 
(interpolation weights) - u is nr x ns x nt in column-major format (inner index is r) - v is a scalar - --------------------------------------------------------------------------*/ - -static double tensor_i1(const double *Jr, uint nr, const double *u) -{ - return tensor_dot(Jr,u,nr); -} - -/* work holds ns doubles */ -static double tensor_i2(const double *Jr, uint nr, - const double *Js, uint ns, - const double *u, double *work) -{ - tensor_mtxv(work,ns, u, Jr,nr); - return tensor_dot(Js,work,ns); -} - -/* work holds ns*nt + nt doubles */ -static double tensor_i3(const double *Jr, uint nr, - const double *Js, uint ns, - const double *Jt, uint nt, - const double *u, double *work) -{ - double *work2 = work+nt; - tensor_mtxv(work2,ns*nt, u, Jr,nr); - tensor_mtxv(work ,nt , work2, Js,ns); - return tensor_dot(Jt,work,nt); -} - -/*-------------------------------------------------------------------------- - 1-,2-,3-d Tensor Application of Row Vectors - for simultaneous Interpolation and Gradient computation - - the 3d case: - v = tensor_ig3(g, wtr,nr, wts,ns, wtt,nt, u, work) - gives v = [ Jr (x) Js (x) Jt ] u - g_0 = [ Dr (x) Js (x) Jt ] u - g_1 = [ Jr (x) Ds (x) Jt ] u - g_2 = [ Jr (x) Js (x) Dt ] u - where Jr,Dr,Js,Ds,Jt,Dt are row vectors, - Jr=wtr, Dr=wtr+nr, etc. 
- (interpolation & derivative weights) - u is nr x ns x nt in column-major format (inner index is r) - v is a scalar, g is an array of 3 doubles - --------------------------------------------------------------------------*/ - -static double tensor_ig1(double g[1], - const double *wtr, uint nr, - const double *u) -{ - g[0] = tensor_dot(wtr+nr,u,nr); - return tensor_dot(wtr ,u,nr); -} - -/* work holds 2*nr doubles */ -static double tensor_ig2(double g[2], - const double *wtr, uint nr, - const double *wts, uint ns, - const double *u, double *work) -{ - tensor_mxm(work,nr, u,ns, wts,2); - g[0] = tensor_dot(wtr+nr,work ,nr); - g[1] = tensor_dot(wtr ,work+nr,nr); - return tensor_dot(wtr ,work ,nr); -} - -/* work holds 2*nr*ns + 3*nr doubles */ -static double tensor_ig3(double g[3], - const double *wtr, uint nr, - const double *wts, uint ns, - const double *wtt, uint nt, - const double *u, double *work) -{ - const uint nrs = nr*ns; - double *a = work, *b = work+2*nrs, *c=b+2*nr; - tensor_mxm(a,nrs, u,nt, wtt,2); - tensor_mxm(b,nr, a,ns, wts,2); - tensor_mxv(c,nr, a+nrs, wts,ns); - g[0] = tensor_dot(b , wtr+nr, nr); - g[1] = tensor_dot(b+nr, wtr , nr); - g[2] = tensor_dot(c , wtr , nr); - return tensor_dot(b , wtr , nr); -} - -/* - out - nr x ns - u - mr x ms - Jrt - mr x nr, Jst - ms x ns - work - nr x ms -*/ -static void tensor_2t(double *out, - const double *Jrt, uint nr, uint mr, - const double *Jst, uint ns, uint ms, - const double *u, double *work) -{ - tensor_mtxm(work,nr, Jrt,mr, u,ms); - tensor_mxm(out,nr, work,ms, Jst,ns); -} - -/* - out - nr x ns x nt - u - mr x ms x mt - Jrt - mr x nr, Jst - ms x ns, Jtt - mt x nt - work - nr*ms*mt + nr*ns*mt = nr*(ms+ns)*mt -*/ -static void tensor_3t(double *out, - const double *Jrt, uint nr, uint mr, - const double *Jst, uint ns, uint ms, - const double *Jtt, uint nt, uint mt, - const double *u, double *work) -{ - const uint nrs=nr*ns, mst=ms*mt, nrms=nr*ms; - uint k; - double *work2 = work+nr*mst; - double *p; const double 
*q; - tensor_mtxm(work,nr, Jrt,mr, u,mst); - for(k=0,p=work2,q=work;k - -/* - Define the integer types used throughout the code, - controlled by preprocessor macros. - - The integer type sint/uint (signed/unsigned) is used - most frequently, e.g., for indexing into local arrays, - and for processor ids. It can be one of - - macro sint/uint type - - (default) int - USE_LONG long - USE_LONG_LONG long long - - The slong/ulong type is used in relatively few places - for global identifiers and indices. It can be one of - - macro slong/ulong type - - (default) int - GLOBAL_LONG long - GLOBAL_LONG_LONG long long - - Since the long long type is not ISO C90, it is never - used unless explicitly asked for. - - The POSIX-standard limits.h header provides the - LLONG_MAX and LLONG_MIN macros, which will be - preferentially used. - -*/ - -#if defined(USE_LONG_LONG) || defined(GLOBAL_LONG_LONG) -typedef long long long_long; -# define WHEN_LONG_LONG(x) x -# if !defined(LLONG_MAX) -# if defined(LONG_LONG_MAX) -# define LLONG_MAX LONG_LONG_MAX -# else -# define LLONG_MAX 9223372036854775807 -# endif -# endif -# if !defined(LLONG_MIN) -# if defined(LONG_LONG_MIN) -# define LLONG_MIN LONG_LONG_MIN -# else -# define LLONG_MIN -9223372036854775807 -# endif -# endif -#else -# define WHEN_LONG_LONG(x) -#endif - -#if !defined(USE_LONG) && !defined(USE_LONG_LONG) -# define TYPE_LOCAL(i,l,ll) i -#elif defined(USE_LONG) -# define TYPE_LOCAL(i,l,ll) l -#elif defined(USE_LONG_LONG) -# define TYPE_LOCAL(i,l,ll) ll -#endif - -#if !defined(GLOBAL_LONG) && !defined(GLOBAL_LONG_LONG) -# define TYPE_GLOBAL(i,l,ll) i -#elif defined(GLOBAL_LONG) -# define TYPE_GLOBAL(i,l,ll) l -#else -# define TYPE_GLOBAL(i,l,ll) ll -#endif - -/* local integer type: for quantities O(N/P) */ -#define sint signed TYPE_LOCAL(int,long,long long) -#define uint unsigned TYPE_LOCAL(int,long,long long) -#define iabs TYPE_LOCAL(abs,labs,llabs) - -/* global integer type: for quantities O(N) */ -#define slong signed 
TYPE_GLOBAL(int,long,long long) -#define ulong unsigned TYPE_GLOBAL(int,long,long long) -#define iabsl TYPE_GLOBAL(abs,labs,llabs) - -#endif - diff --git a/3rdParty/gslib/tests/comm_test.c b/3rdParty/gslib/tests/comm_test.c deleted file mode 100644 index 49b1af0f8..000000000 --- a/3rdParty/gslib/tests/comm_test.c +++ /dev/null @@ -1,37 +0,0 @@ -#include -#include -#include -#include "name.h" -#include "fail.h" -#include "types.h" -#include "gs_defs.h" -#include "comm.h" - -int main(int narg, char *arg[]) -{ - comm_ext world; int np; - struct comm comm; - ulong sum[2],r[2],v, test; -#ifdef MPI - MPI_Init(&narg,&arg); - world = MPI_COMM_WORLD; - MPI_Comm_size(world,&np); -#else - world=0, np=1; -#endif - - comm_init(&comm,world); - - v = comm.id+1; - test = comm_reduce_slong(&comm,gs_add,(slong*)&v,1); - comm_scan(sum, &comm,gs_slong,gs_add, &v,1, r); - printf("%02d: %d %d %d\n",(int)comm.id,(int)sum[0],(int)sum[1],(int)test); - - comm_free(&comm); - -#ifdef MPI - MPI_Finalize(); -#endif - - return 0; -} diff --git a/3rdParty/gslib/tests/crystal_test.c b/3rdParty/gslib/tests/crystal_test.c deleted file mode 100644 index c7f50df64..000000000 --- a/3rdParty/gslib/tests/crystal_test.c +++ /dev/null @@ -1,88 +0,0 @@ -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "comm.h" -#include "mem.h" -#include "crystal.h" - -int main(int narg, char *arg[]) -{ - comm_ext world; int np; - struct comm comm; - struct crystal cr; - uint i,sum, *data, *end; -#ifdef MPI - MPI_Init(&narg,&arg); - world = MPI_COMM_WORLD; - MPI_Comm_size(world,&np); -#else - world=0, np=1; -#endif - - comm_init(&comm,world); - - crystal_init(&cr,&comm); - - cr.data.n = (4+(comm.id&1))*comm.np; - buffer_reserve(&cr.data,cr.data.n*sizeof(uint)); - data = cr.data.ptr; - for(i=0;i %u:",data[1],data[0]); - for(i=0;i -#include -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "types.h" -#include 
"fail.h" -#include "mem.h" -#include "poly.h" -#include "findpts_el.h" - -#define NR 14 -#define NS 7 - -static const unsigned nr[3]={NR,NS}; - -static double elx[NR*NS], ely[NR*NS]; -static const double *const elx2[2] = {elx,ely}; - -int main() -{ - int pass=1; - unsigned i,j; - double zr[NR], zs[NS]; - struct findpts_el_data_2 fd; - struct findpts_el_pt_2 *pt; - findpts_el_setup_2(&fd,nr,NR*NS); - pt = findpts_el_points_2(&fd); - - lobatto_nodes(zr,NR); - lobatto_nodes(zs,NS); - - for(j=0;jx[0] = zr[i]*2, p->x[1] = zs[j]*2; - p->r[0] = 0, p->r[1] = 0; - p->flags = 0; - } - - findpts_el_2(&fd, NR*NS, 1024*DBL_EPSILON); - - for(j=0;jx[0],p->x[1], p->r[0],p->r[1], - p->flags, p->dist2); - #define CLAMP(x,r) \ - do { double temp=r; x = temp<-1?-1:(temp>1?1:temp); } while(0) - CLAMP(r,zr[i]*2); CLAMP(s,zs[j]*2); - #undef CLAMP - if( fabs(r-p->r[0])+fabs(s-p->r[1]) > 1024*DBL_EPSILON ) - { printf("off by %g\n", fabs(r-p->r[0])+fabs(s-p->r[1])); - pass=0; goto fin; } - } - -fin: - - findpts_el_free_2(&fd); - - printf("Tests %s\n", pass?"passed":"failed"); - - return 0; -} diff --git a/3rdParty/gslib/tests/findpts_el_2_test2.c b/3rdParty/gslib/tests/findpts_el_2_test2.c deleted file mode 100644 index 6942da0c2..000000000 --- a/3rdParty/gslib/tests/findpts_el_2_test2.c +++ /dev/null @@ -1,97 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "c99.h" -#include "types.h" -#include "name.h" -#include "fail.h" -#include "mem.h" -#include "tensor.h" -#include "poly.h" -#include "lob_bnd.h" -#include "obbox.h" -#include "findpts_el.h" -#include "rand_elt_test.h" - -#define REPEAT 10000 - -#define NR 7 -#define TNR 8 -#define NS 8 -#define TNS 9 -#define TNTOT (TNR*TNS) -#define MR (4*NR) -#define MS (4*NS) - -static const unsigned nr[2] = {NR,NS}; - -/* #define NPT 1 */ -#define NPT 256 -/* #define NPT TNR*TNS */ - -#define TOL 1024*DBL_EPSILON - -static double zr[NR], zs[NS]; -static double tzr[TNR], tzs[TNS]; -static double Jr[NR*TNR],Js[NS*TNS]; 
-static double elx[NR*NS], ely[NR*NS]; -static const double *const elxy[2] = {elx,ely}; -static double telx[2][TNR*TNS]; -static double work[TNR*NS]; - -int main() -{ - int failure=0, unconv=0; - unsigned n,i,ie; - - struct findpts_el_data_2 fd; - struct findpts_el_pt_2 *pt; - findpts_el_setup_2(&fd,nr,NPT); - pt = findpts_el_points_2(&fd); - - lobatto_nodes(tzr,TNR), lobatto_nodes(tzs,TNS); - lobatto_nodes(zr,NR), lobatto_nodes(zs,NS); - - for(i=0;iTNTOT ? TNTOT : ie; - for(;i!=ie;++i) { - struct findpts_el_pt_2 *p = pt+(i-i0); - const double x=telx[0][i],y=telx[1][i]; - p->x[0]=x,p->x[1]=y; - p->flags = 0; - } - findpts_el_2(&fd, ie-i0, 1024*DBL_EPSILON); - for(i=i0;i!=ie;++i) { - struct findpts_el_pt_2 *p = pt+(i-i0); - const double r=tzr[i%TNR], s=tzs[i/TNR]; - if((p->flags&(1u<<4))==0) ++unconv; - if(fabs(p->r[0]-r)+fabs(p->r[1]-s)>1024*DBL_EPSILON) { - printf("found (%g,%g) for (%g,%g) ; error (%g,%g)\n", - p->r[0],p->r[1], r,s, p->r[0]-r,p->r[1]-s); - printf("(%g,%g) for (%.15g,%.15g) ; dist2 = %g\n", - p->x[0],p->x[1], - telx[0][i],telx[1][i],p->dist2); - ++failure; - } - } - } - } - - findpts_el_free_2(&fd); - - printf("%u failed points (out of %u)\n", failure, REPEAT*TNTOT); - printf("%u unconverged points\n", unconv); - - return !(failure == 39); -} diff --git a/3rdParty/gslib/tests/findpts_el_3_test.c b/3rdParty/gslib/tests/findpts_el_3_test.c deleted file mode 100644 index 54431c339..000000000 --- a/3rdParty/gslib/tests/findpts_el_3_test.c +++ /dev/null @@ -1,77 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "types.h" -#include "fail.h" -#include "mem.h" -#include "poly.h" -#include "findpts_el.h" - -#define NR 14 -#define NS 7 -#define NT 25 - -static const unsigned nr[3]={NR,NS,NT}; - -static double elx[NR*NS*NT], ely[NR*NS*NT], elz[NR*NS*NT]; -static const double *const elx3[3] = {elx,ely,elz}; - -int main() -{ - int pass=1; - unsigned i,j,k; - double zr[NR], zs[NS], zt[NT]; - 
struct findpts_el_data_3 fd; - struct findpts_el_pt_3 *pt; - findpts_el_setup_3(&fd,nr,NR*NS*NT); - pt = findpts_el_points_3(&fd); - - lobatto_nodes(zr,NR); - lobatto_nodes(zs,NS); - lobatto_nodes(zt,NT); - - for(k=0;kx[0] = zr[i]*2, p->x[1] = zs[j]*2, p->x[2] = zt[k]*2; - p->r[0] = 0, p->r[1] = 0, p->r[2] = 0; - p->flags = 0; - } - - findpts_el_3(&fd, NR*NS*NT, 1024*DBL_EPSILON); - /* sort_points(pt,NR*NS*NT); */ - - for(k=0;kx[0],p->x[1],p->x[2], p->r[0],p->r[1],p->r[2], - p->flags, p->dist2); - #define CLAMP(x,r) \ - do { double temp=r; x = temp<-1?-1:(temp>1?1:temp); } while(0) - CLAMP(r,zr[i]*2); CLAMP(s,zs[j]*2); CLAMP(t,zt[k]*2); - #undef CLAMP - if( fabs(r-p->r[0])+fabs(s-p->r[1])+fabs(t-p->r[2]) > 1024*DBL_EPSILON ) - { printf("off by %g\n", fabs(r-p->r[0])+fabs(s-p->r[1])+fabs(t-p->r[2])); - pass=0; goto fin; } - } - -fin: - - findpts_el_free_3(&fd); - - printf("Tests %s\n", pass?"passed":"failed"); - - return 0; -} diff --git a/3rdParty/gslib/tests/findpts_el_3_test2.c b/3rdParty/gslib/tests/findpts_el_3_test2.c deleted file mode 100644 index 627b9c660..000000000 --- a/3rdParty/gslib/tests/findpts_el_3_test2.c +++ /dev/null @@ -1,107 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "c99.h" -#include "types.h" -#include "name.h" -#include "fail.h" -#include "mem.h" -#include "tensor.h" -#include "poly.h" -#include "lob_bnd.h" -#include "obbox.h" -#include "findpts_el.h" -#include "rand_elt_test.h" - -#define REPEAT 100 - -#define NR 7 -#define TNR 8 -#define NS 8 -#define TNS 9 -#define NT 9 -#define TNT 7 -#define TNTOT (TNR*TNS*TNT) -#define MR (4*NR) -#define MS (4*NS) -#define MT (4*NT) - -static const unsigned nr[3] = {NR,NS,NT}; - -/* #define NPT 1 */ -#define NPT 256 -/* #define NPT TNR*TNS*TNT */ - -#define TOL 1024*DBL_EPSILON - -static double zr[NR], zs[NS], zt[NT]; -static double tzr[TNR], tzs[TNS], tzt[TNT]; -static double Jr[NR*TNR],Js[NS*TNS],Jt[NT*TNT]; -static double elx[NR*NS*NT], ely[NR*NS*NT], elz[NR*NS*NT]; 
-static const double *const elxyz[3] = {elx,ely,elz}; -static double telx[3][TNR*TNS*TNT]; -static double work[TNR*(NS+TNS)*NT]; - -int main() -{ - int failure=0; - unsigned n,i,ie; - - int unconv=0; - - struct findpts_el_data_3 fd; - struct findpts_el_pt_3 *pt; - findpts_el_setup_3(&fd,nr,NPT); - pt = findpts_el_points_3(&fd); - - lobatto_nodes(tzr,TNR), lobatto_nodes(tzs,TNS), lobatto_nodes(tzt,TNT); - lobatto_nodes(zr,NR), lobatto_nodes(zs,NS), lobatto_nodes(zt,NT); - - for(i=0;iTNTOT ? TNTOT : ie; - for(;i!=ie;++i) { - struct findpts_el_pt_3 *p = pt+(i-i0); - const double x=telx[0][i],y=telx[1][i],z=telx[2][i]; - p->x[0]=x,p->x[1]=y,p->x[2]=z; - p->flags = 0; - } - findpts_el_3(&fd, ie-i0, 1024*DBL_EPSILON); - for(i=i0;i!=ie;++i) { - struct findpts_el_pt_3 *p = pt+(i-i0); - const double r=tzr[i%TNR], s=tzs[(i/TNR)%TNS], t=tzt[i/(TNR*TNS)]; - if((p->flags&(1u<<6))==0) ++unconv; - if(fabs(p->r[0]-r)+fabs(p->r[1]-s)+fabs(p->r[2]-t)>1024*DBL_EPSILON) { - printf("found (%g,%g,%g) for (%g,%g,%g) ; error (%g,%g,%g)\n", - p->r[0],p->r[1],p->r[2], r,s,t, p->r[0]-r,p->r[1]-s,p->r[2]-t); - printf("(%g,%g,%g) for (%.15g,%.15g,%.15g) ; dist2 = %g\n", - p->x[0],p->x[1],p->x[2], - telx[0][i],telx[1][i],telx[2][i],p->dist2); - ++failure; - } - } - } - } - - findpts_el_free_3(&fd); - - printf("%u failed points (out of %u)\n", failure, (6+REPEAT)*TNTOT); - printf("%u unconverged points\n", unconv); - - return 0; -} diff --git a/3rdParty/gslib/tests/findpts_local_test.c b/3rdParty/gslib/tests/findpts_local_test.c deleted file mode 100644 index 0ebe144df..000000000 --- a/3rdParty/gslib/tests/findpts_local_test.c +++ /dev/null @@ -1,210 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "mem.h" -#include "types.h" -#include "poly.h" -#include "obbox.h" -#include "findpts_el.h" -#include "findpts_local.h" -#include "rand_elt_test.h" - -#define D 3 - -#if D==3 -#define INITD(a,b,c) {a,b,c} -#define 
MULD(a,b,c) ((a)*(b)*(c)) -#define INDEXD(a,na, b,nb, c) (((c)*(nb)+(b))*(na)+(a)) -#define findpts_local_data findpts_local_data_3 -#define findpts_local_setup findpts_local_setup_3 -#define findpts_local_free findpts_local_free_3 -#define findpts_local findpts_local_3 -#elif D==2 -#define INITD(a,b,c) {a,b} -#define MULD(a,b,c) ((a)*(b)) -#define INDEXD(a,na, b,nb, c) ((b)*(na)+(a)) -#define findpts_local_data findpts_local_data_2 -#define findpts_local_setup findpts_local_setup_2 -#define findpts_local_free findpts_local_free_2 -#define findpts_local findpts_local_2 -#endif - -#define NR 5 -#define NS 8 -#define NT 6 -#define K 4 -#define NEL MULD(K,K,K) -#define TN 4 - -#define NPT_MAX 256 -#define BBOX_TOL 0.01 -#define NEWT_TOL 1024*DBL_EPSILON -#define MAX_HASH_SIZE NEL*MULD(NR,NS,NT) - -/* -#define NPT_MAX 256 -#define BBOX_TOL 1.00 -#define NEWT_TOL 1024*DBL_EPSILON -#define MAX_HASH_SIZE NEL*3 -*/ - -static const unsigned nr[D] = INITD(NR,NS,NT); -static const unsigned mr[D] = INITD(4*NR,4*NS,4*NT); -static double zr[NR], zs[NS], zt[NT]; -static double x3[D][MULD(3,3,3)]; -static double mesh[D][NEL*MULD(NR,NS,NT)]; -static const double *const elx[D] = INITD(mesh[0],mesh[1],mesh[2]); - -static double testx[NEL*MULD(TN,TN,TN)*D]; -struct pt_data { double r[D], dist2; uint code, el; }; -static struct pt_data testp[NEL*MULD(TN,TN,TN)]; - -static double quad_eval(const double coef[MULD(3,3,3)], const double r[D]) -{ - double lr0[D], lr1[D], lr2[D]; - unsigned d; - for(d=0;ddist2max?testp[i].dist2:dist2max; - if(testp[i].code==2) ++notfound; - } - printf("Maximum distance = %g\n%u points not found\n", - sqrt(dist2max), (unsigned)notfound); -} - -static void test(buffer *buf) -{ - const double *const x_base[D]=INITD(testx,testx+1,testx+2); - const unsigned x_stride[D]= - INITD(D*sizeof(double),D*sizeof(double),D*sizeof(double)); - struct findpts_local_data fld; - rand_mesh(); - test_mesh(); - findpts_local_setup(&fld,elx,nr,NEL,mr,BBOX_TOL,MAX_HASH_SIZE, - 
NPT_MAX,NEWT_TOL); - findpts_local(&testp[0].code , sizeof(struct pt_data), - &testp[0].el , sizeof(struct pt_data), - testp[0].r , sizeof(struct pt_data), - &testp[0].dist2, sizeof(struct pt_data), - x_base, x_stride, - NEL*MULD(TN,TN,TN), &fld, buf); - findpts_local_free(&fld); - print_ptdata(); -} - -int main() -{ - buffer buf = null_buffer; - lobatto_nodes(zr,NR),lobatto_nodes(zs,NS),lobatto_nodes(zt,NT); - test(&buf); - buffer_free(&buf); - return 0; -} diff --git a/3rdParty/gslib/tests/findpts_test.c b/3rdParty/gslib/tests/findpts_test.c deleted file mode 100644 index ad9638228..000000000 --- a/3rdParty/gslib/tests/findpts_test.c +++ /dev/null @@ -1,328 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "mem.h" -#include "poly.h" -#include "gs_defs.h" -#include "comm.h" -#include "rand_elt_test.h" -#include "findpts.h" -#include "crystal.h" -#include "sarray_transfer.h" - -#define D 3 - -#if D==3 -#define INITD(a,b,c) {a,b,c} -#define MULD(a,b,c) ((a)*(b)*(c)) -#define INDEXD(a,na, b,nb, c) (((c)*(nb)+(b))*(na)+(a)) -#define findpts_data findpts_data_3 -#define findpts_setup findpts_setup_3 -#define findpts_free findpts_free_3 -#define findpts findpts_3 -#define findpts_eval findpts_eval_3 -#elif D==2 -#define INITD(a,b,c) {a,b} -#define MULD(a,b,c) ((a)*(b)) -#define INDEXD(a,na, b,nb, c) ((b)*(na)+(a)) -#define findpts_data findpts_data_2 -#define findpts_setup findpts_setup_2 -#define findpts_free findpts_free_2 -#define findpts findpts_2 -#define findpts_eval findpts_eval_2 -#endif - -#define NR 5 -#define NS 7 -#define NT 6 -#define K 4 -#define NEL MULD(K,K,K) -#define TN 4 - -#define NPT_MAX 256 -#define BBOX_TOL 0.01 -#define NEWT_TOL 1024*DBL_EPSILON -#define LOC_HASH_SIZE NEL*MULD(NR,NS,NT) -#define GBL_HASH_SIZE NEL*MULD(NR,NS,NT) - -/* -#define NPT_MAX 256 -#define BBOX_TOL 1.00 -#define NEWT_TOL 1024*DBL_EPSILON -#define LOC_HASH_SIZE NEL*3 
-#define GBL_HASH_SIZE NEL*3 -*/ - -static uint np, id; - -static const unsigned nr[D] = INITD(NR,NS,NT); -static const unsigned mr[D] = INITD(2*NR,2*NS,2*NT); -static double zr[NR], zs[NS], zt[NT]; -static double x3[D][MULD(3,3,3)]; -static double mesh[D][NEL*MULD(NR,NS,NT)]; -static const double *const elx[D] = INITD(mesh[0],mesh[1],mesh[2]); - -struct pt_data { double x[D], r[D], dist2, ex[D]; uint code, proc, el; }; -static struct array testp; - -static struct crystal cr; - -static double quad_eval(const double coef[MULD(3,3,3)], const double r[D]) -{ - double lr0[D], lr1[D], lr2[D]; - unsigned d; - for(d=0;dproc = rand()%np; - out->x[0] = quad_eval(x3[0],r); - out->x[1] = quad_eval(x3[1],r); - #if D==3 - out->x[2] = quad_eval(x3[2],r); - #endif - ++out; - }} - #if D==3 - } - #endif - } - sarray_transfer(struct pt_data,&testp,proc,1,&cr); - if(0) - printf("%u: %u shuffled points\n",id,(unsigned)testp.n); -} - -static void print_ptdata(const struct comm *const comm) -{ - uint notfound=0; - double dist2max=0, ed2max=0; - const struct pt_data *pt = testp.ptr, *const end = pt+testp.n; - for(;pt!=end;++pt) { - if(0&&id==0) - printf("code=%u, proc=%u, el=%u, dist2=%g, r=(%.17g,%.17g" - #if D==3 - ",%.17g" - #endif - "), " - "x=(%.17g,%.17g" - #if D==3 - ",%.17g" - #endif - "), ex=(%.17g,%.17g" - #if D==3 - ",%.17g" - #endif - ")\n", - pt->code,pt->proc,pt->el,pt->dist2, - pt->r[0],pt->r[1], - #if D==3 - pt->r[2], - #endif - pt->x[0],pt->x[1], - #if D==3 - pt->x[2], - #endif - pt->ex[0],pt->ex[1] - #if D==3 - ,pt->ex[2] - #endif - ); - if(pt->code==2) ++notfound; - else { - double ed2=0, dx; - unsigned d; for(d=0;dx[d]-pt->ex[d], ed2+=dx*dx; - dist2max=pt->dist2>dist2max?pt->dist2:dist2max; - ed2max=ed2>ed2max?ed2:ed2max; - } - } - { - double distmax=sqrt(dist2max), edmax=sqrt(ed2max); - slong total=testp.n; - if(0) - printf("%u: maximum distance = %g (adv), %g (eval);" - " %u/%u points not found\n", - (unsigned)id, distmax, edmax, - (unsigned)notfound, 
(unsigned)testp.n); - distmax = comm_reduce_double(comm,gs_max,&distmax,1); - edmax = comm_reduce_double(comm,gs_max,&edmax ,1); - notfound = comm_reduce_sint(comm,gs_add,(sint*)¬found,1); - total = comm_reduce_slong(comm,gs_add,&total,1); - if(id==0) - printf("maximum distance = %g (adv), %g (eval);" - " %u/%lu points not found\n", - distmax, edmax, - (unsigned)notfound, (unsigned long)total); - } -} - -static void test(const struct comm *const comm) -{ - const double *x_base[D]; - const unsigned x_stride[D] = INITD(sizeof(struct pt_data), - sizeof(struct pt_data), - sizeof(struct pt_data)); - struct findpts_data *fd; - struct pt_data *pt; - unsigned d; - if(id==0) printf("Initializing mesh\n"); - rand_mesh(); - test_mesh(); - pt = testp.ptr; - if(id==0) printf("calling findpts_setup\n"); - fd=findpts_setup(comm,elx,nr,NEL,mr,BBOX_TOL, - LOC_HASH_SIZE,GBL_HASH_SIZE, - NPT_MAX,NEWT_TOL); - if(id==0) printf("calling findpts\n"); - x_base[0]=pt->x, x_base[1]=pt->x+1; - #if D==3 - x_base[2]=pt->x+2; - #endif - findpts(&pt->code , sizeof(struct pt_data), - &pt->proc , sizeof(struct pt_data), - &pt->el , sizeof(struct pt_data), - pt->r , sizeof(struct pt_data), - &pt->dist2, sizeof(struct pt_data), - x_base , x_stride, testp.n, fd); - for(d=0;dex[d], sizeof(struct pt_data), - &pt->code , sizeof(struct pt_data), - &pt->proc , sizeof(struct pt_data), - &pt->el , sizeof(struct pt_data), - pt->r , sizeof(struct pt_data), - testp.n, mesh[d], fd); - } - findpts_free(fd); - print_ptdata(comm); -} - -int main(int narg, char *arg[]) -{ - comm_ext world; - struct comm comm; - -#ifdef MPI - MPI_Init(&narg,&arg); - world = MPI_COMM_WORLD; -#else - world=0; -#endif - - comm_init(&comm,world); - id=comm.id, np=comm.np; - - lobatto_nodes(zr,NR),lobatto_nodes(zs,NS),lobatto_nodes(zt,NT); - array_init(struct pt_data,&testp,NEL*MULD(TN,TN,TN)); - crystal_init(&cr,&comm); - test(&comm); - crystal_free(&cr); - array_free(&testp); - - comm_free(&comm); - -#ifdef MPI - MPI_Finalize(); 
-#endif - - return 0; -} diff --git a/3rdParty/gslib/tests/fortran/f-igs.f b/3rdParty/gslib/tests/fortran/f-igs.f deleted file mode 100644 index 6cff51374..000000000 --- a/3rdParty/gslib/tests/fortran/f-igs.f +++ /dev/null @@ -1,59 +0,0 @@ - program figs - implicit none - - include 'mpif.h' - - integer npmax - parameter(npmax=16) - - integer ierror,handle,hwait,np,me,i,neighbors,count - integer*8 id(npmax) - - real*8 answer(npmax),u(npmax) - - call mpi_init(ierror) - call mpi_comm_size(mpi_comm_world,np,ierror) - call mpi_comm_rank(mpi_comm_world,me,ierror) - - count=1 - if(me.gt.0) then - id(count)=me - count=count+1 - endif - id(count)=me+1 - count=count+1 - if(me.lt.(np-1)) then - id(count)=me+2 - count=count+1 - endif - - neighbors=count-1 -! gs_pairwise - call gs_setup_pick(handle,id,neighbors,mpi_comm_world,np,1) - - if(np.eq.1) then - answer(1)=1.0 - else - answer(1)=2.0 - answer(np)=2.0 - do i=2,np-1 - answer(i)=3.0 - enddo - endif - - do i=1,neighbors - u(i)=1.0 - enddo - - call igs_op(handle,u,1,1,0,hwait) - call gs_op_wait(hwait) - - do i=1,neighbors - if(abs(u(i)-answer(id(i)))>1e-16) then - write(6,*) 'igs_op test failed' - endif - enddo - - call mpi_finalize(ierror) - - end diff --git a/3rdParty/gslib/tests/gs_test.c b/3rdParty/gslib/tests/gs_test.c deleted file mode 100644 index 1d0e948c7..000000000 --- a/3rdParty/gslib/tests/gs_test.c +++ /dev/null @@ -1,133 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "comm.h" -#include "mem.h" -#include "gs_defs.h" -#include "gs.h" - -typedef double T; -const gs_dom dom = gs_double; - -static void test(const struct comm *comm, gs_method method) -{ - struct gs_data *gsh; - const uint np = comm->np; - slong *id = tmalloc(slong,np+4); - T *v = tmalloc(T,np+4); - uint i; - id[0] = -(slong)(np+10+3*comm->id); - for(i=0;iid+1; - id[np+2] = comm->id+1; - id[np+3] = np-comm->id; - gsh = 
gs_setup(id,np+4,comm,0,method,1); - free(id); - - /* non-blocking api - original test */ - if(comm->id==0) printf("\nTesting non-blocking api ...\n"); - for(i=0;iid==0) for(i=0;iid==0) printf("\n"); - - for(i=0;iid==0) for(i=0;iid==0) printf("\nTesting blocking api ...\n"); - for(i=0;iid==0) for(i=0;iid==0) printf("\n"); - - for(i=0;iid==0) for(i=0;iid; - uint count=0; - if(me>0) id1[count++]=me; - id1[count++]=me+1; - if(me -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "comm.h" -#include "mem.h" -#include "gs_defs.h" -#include "gs.h" - -struct gs_data *gop_handle; -int np; - -//------------------------------------------------------------------------------ -void gop_init(struct comm *gop_comm, comm_ext world) { - comm_init(gop_comm, world); - - const long long gop_id = 1; - - gop_handle = gs_setup(&gop_id, 1, gop_comm, 0, gs_auto, 0); -} -//------------------------------------------------------------------------------ -void gop(void *u, gs_dom dom, gs_op op, unsigned transpose) { - gs(u, dom, op, transpose, gop_handle, NULL); -} -//------------------------------------------------------------------------------ -void gop_free(struct comm* gop_comm) { - comm_free(gop_comm); - - gs_free(gop_handle); -} -//------------------------------------------------------------------------------ -int test_min(int rank) { - int min = rank; - gop(&min, gs_int, gs_min, 0); - - if (rank == 0) printf("\ngop min test: "); - if (min == 0) { - if (rank == 0) printf("[Passed]"); - return 0; - } else { - if (rank == 0) printf("[Failed]"); - return 1; - } -} -//------------------------------------------------------------------------------ -int test_max(int rank) { - int max = rank; - gop(&max, gs_int, gs_max, 0); - - if (rank == 0) printf("\ngop max test: "); - if (max == np-1) { - if (rank == 0) printf("[Passed]"); - return 0; - } else { - if (rank == 0) printf("[Failed]"); - return 1; - } -} 
-//------------------------------------------------------------------------------ -int test_add(int rank) { - int sum = rank; - gop(&sum, gs_int, gs_add, 0); - sum *= 2; - - if (rank == 0) printf("\ngop add test: "); - if (sum == np*(np-1)) { - if (rank == 0) printf("[Passed]"); - return 0; - } else { - if (rank == 0) printf("[Failed]"); - return 1; - } -} -//------------------------------------------------------------------------------ -int main(int narg, char *arg[]) -{ - comm_ext world; int rank, result; - struct comm comm; - -#ifdef MPI - MPI_Init(&narg,&arg); - world = MPI_COMM_WORLD; - MPI_Comm_size(world,&np); - MPI_Comm_rank(world,&rank); -#else - world=0, np=1; rank = 0; -#endif - - gop_init(&comm,world); - - result = test_min(rank); - result += test_max(rank); - result += test_add(rank); - - gop_free(&comm); - -#ifdef MPI - MPI_Finalize(); -#endif - - return result; -} diff --git a/3rdParty/gslib/tests/gs_test_gop_nonblocking.c b/3rdParty/gslib/tests/gs_test_gop_nonblocking.c deleted file mode 100644 index b1c2a7057..000000000 --- a/3rdParty/gslib/tests/gs_test_gop_nonblocking.c +++ /dev/null @@ -1,131 +0,0 @@ -#include -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "comm.h" -#include "mem.h" -#include "gs_defs.h" -#include "gs.h" - -struct gs_data *gop_handle; -int np; - -//------------------------------------------------------------------------------ -void gop_init(struct comm *gop_comm, comm_ext world) { - comm_init(gop_comm, world); - - const long long gop_id = 1; - - gop_handle = gs_setup(&gop_id, 1, gop_comm, 0, gs_pairwise, 0); -} -//------------------------------------------------------------------------------ -void igop(void *u, gs_dom dom, gs_op op, unsigned transpose) { - // In a real case, these calls will be split across other code - int handle; - igs(u, dom, op, transpose, gop_handle, NULL, &handle); - gs_wait (handle); -} 
-//------------------------------------------------------------------------------ -void gop_free(struct comm* gop_comm) { - comm_free(gop_comm); - - gs_free(gop_handle); -} -//------------------------------------------------------------------------------ -int test_imin(int rank) { - int min = rank; - igop(&min, gs_int, gs_min, 0); - - if (rank == 0) printf("\ngop min test: "); - if (min == 0) { - if (rank == 0) printf("[Passed]"); - return 0; - } else { - if (rank == 0) printf("[Failed]"); - return 1; - } -} -//------------------------------------------------------------------------------ -int test_imax(int rank) { - int max = rank; - igop(&max, gs_int, gs_max, 0); - - if (rank == 0) printf("\ngop max test: "); - if (max == np-1) { - if (rank == 0) printf("[Passed]"); - return 0; - } else { - if (rank == 0) printf("[Failed]"); - return 1; - } -} -//------------------------------------------------------------------------------ -int test_iadd(int rank) { - int sum = rank; - igop(&sum, gs_int, gs_add, 0); - sum *= 2; - - if (rank == 0) printf("\ngop add test: "); - if (sum == np*(np-1)) { - if (rank == 0) printf("[Passed]"); - return 0; - } else { - if (rank == 0) printf("[Failed]"); - return 1; - } -} -//------------------------------------------------------------------------------ -int test_imul(int rank) { - int mul = rank + 1; - igop(&mul, gs_int, gs_mul, 0); - - int answer=1, i; - for(i = 2; i <= np; i++) { - answer*=i; - } - if (rank == 0) printf("\ngop mul test: "); - if (mul == answer) { - if (rank == 0) printf("[Passed]"); - return 0; - } else { - if (rank == 0) printf("[Failed]"); - return 1; - } -} -//------------------------------------------------------------------------------ -int main(int narg, char *arg[]) -{ - comm_ext world; int rank, result; - struct comm comm; - -#ifdef MPI - MPI_Init(&narg,&arg); - world = MPI_COMM_WORLD; - MPI_Comm_size(world,&np); - MPI_Comm_rank(world,&rank); -#else - world=0, np=1; rank = 0; -#endif - - gop_init(&comm,world); 
- - result = test_imin(rank); - result += test_imax(rank); - result += test_iadd(rank); - result += test_imul(rank); - - gop_free(&comm); - - if (rank == 0) printf("\n"); - -#ifdef MPI - MPI_Finalize(); -#endif - - return result; -} diff --git a/3rdParty/gslib/tests/gs_test_old.c b/3rdParty/gslib/tests/gs_test_old.c deleted file mode 100644 index f6143333e..000000000 --- a/3rdParty/gslib/tests/gs_test_old.c +++ /dev/null @@ -1,148 +0,0 @@ -/* simple stand-alone test for parallel gather-scatter routines - assumes gather-scatter routines were compiled with default names - can compile to sequential version if MPI is not defined - - the test is as follows, where N is the number of procs: - there are N physical nodes (vertices) - each proc has 2 local/virtual nodes mapping to each physical node, - for a total of 2*N*N virtual nodes - virtual nodes are given values that correspond to a sequential ordering - (so that they range from 0 to 2*N*N-1) - the addition operation is performed and the result is checked, - the correct result being known a priori - the addition operation is also checked, in a similar manner, for - both the cpgs_op_vec and cpgs_op_many routines with vector dimension 3 -*/ - -#include -#include -#include -#ifdef MPI -# include -#else - typedef void MPI_Comm; -#endif -#include "name.h" -#include "types.h" - -typedef long real; -sint datatype = 3; - -#define fgs_setup FORTRAN_NAME(gs_setup ,GS_SETUP ) -#define fgs_op FORTRAN_NAME(gs_op ,GS_OP ) -#define fgs_op_vec FORTRAN_NAME(gs_op_vec ,GS_OP_VEC ) -#define fgs_op_many FORTRAN_NAME(gs_op_many ,GS_OP_MANY ) -#define fgs_op_fields FORTRAN_NAME(gs_op_fields,GS_OP_FIELDS) -#define fgs_free FORTRAN_NAME(gs_free ,GS_FREE ) - -void fgs_setup(sint *handle, const slong id[], const sint *n, - const MPI_Fint *comm, const sint *np); -void fgs_op(const sint *handle, void *u, const sint *dom, const sint *op, - const sint *transpose); -void fgs_op_vec(const sint *handle, void *u, const sint *n, - const sint *dom, 
const sint *op, const sint *transpose); -void fgs_op_many(const sint *handle, void *u1, void *u2, void *u3, - void *u4, void *u5, void *u6, const sint *n, - const sint *dom, const sint *op, const sint *transpose); -void fgs_free(const sint *handle); - -void assert_is_zero(real v) -{ - if(fabs(v) < 1e-20) return; - printf("test failed\n"); - exit(1); -} - -int main(int narg, char* arg[]) -{ - sint transpose=0; - sint id=0,np=1; - sint i,handle,maxv=3; - real *u; - slong *glindex; -#ifndef MPI - int comm; -#else - MPI_Init(&narg,&arg); - MPI_Comm comm; - MPI_Comm_dup(MPI_COMM_WORLD,&comm); - MPI_Fint fcomm = MPI_Comm_c2f(comm); - { int i; - MPI_Comm_rank(comm,&i); id=i; - MPI_Comm_size(comm,&i); np=i; - } -#endif - - glindex = malloc(np*2*sizeof(slong)); - for(i=0;i -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "comm.h" -#include "mem.h" -#include "gs_defs.h" -#include "gs.h" - -static void test(const struct comm *comm) -{ - uint i,np=comm->np,id=comm->id; - slong *glindex = tmalloc(slong,np*2); - char *out, *buf = tmalloc(char,80+np*2*30); - struct gs_data *gsh; - - for(i=0;iid); - for(i=0;iid); - for(i=0;iid==0) printf("\nTesting non-blocking api ...\n"); - for(i=0;iid); - for(i=0;iid==0) printf("\nTesting blocking api ...\n"); - for(i=0;iid); - for(i=0;i -#include -#include -#include -#include -#include "c99.h" -#include "types.h" -#include "name.h" -#include "fail.h" -#include "mem.h" -#include "tensor.h" -#include "poly.h" -#include "lob_bnd.h" - - -#define RESFAC 4 -#define N 12 -#define NY 9 -#define NZ 4 -#define REPEAT 1000000 - -#define PI 3.1415926535897932384626433832795028841971693993751058209749445923 - - -int main() -{ - int failure=0; - uint i,r; - double p[NZ*NY*N]; - double lb[2*(RESFAC*NZ)*(RESFAC*NY)*(RESFAC*N)]; - double work[2*(RESFAC*N)*(RESFAC*NY)*(NZ+1)]; - - double *ld_N = tmalloc(double,N+gll_lag_size(N)); - lagrange_fun *lag_N = gll_lag_setup(ld_N+N,N); - - double 
*ld_NY= tmalloc(double,NY+gll_lag_size(NY)); - lagrange_fun *lag_NY = gll_lag_setup(ld_NY+NY,NY); - - double *ld_NZ= tmalloc(double,NZ+gll_lag_size(NZ)); - lagrange_fun *lag_NZ = gll_lag_setup(ld_NZ+NZ,NZ); - - double *lb_N = tmalloc(double,lob_bnd_size(N ,RESFAC*N )); - double *lb_NY = tmalloc(double,lob_bnd_size(NY,RESFAC*NY)); - double *lb_NZ = tmalloc(double,lob_bnd_size(NZ,RESFAC*NZ)); - lob_bnd_setup(lb_N , N ,RESFAC*N ); - lob_bnd_setup(lb_NY, NY,RESFAC*NY ); - lob_bnd_setup(lb_NZ, NZ,RESFAC*NZ ); - /*for(i=0;i -#include -#include -#include -#include -#include "c99.h" -#include "types.h" -#include "name.h" -#include "fail.h" -#include "mem.h" -#include "poly.h" -#include "lob_bnd.h" -#include "obbox.h" -#include "rand_elt_test.h" - -#define REPEAT 20 - -#define N 100 -#define NR 7 -#define MR (4*NR) -#define NS 8 -#define MS (4*NS) -#define NT 9 -#define MT (4*NT) - -#define TOL 0.00001 - -static const unsigned nr[3]={NR,NS,NT}, mr[3]={MR,MS,MT}; - -static double zr[NR], zs[NS], zt[NT]; -static double x[NR*NS*NT*N], y[NR*NS*NT*N], z[NR*NS*NT*N]; -static double tx[3][NR*NS*NT]; -static const double *const elx[3]={x,y,z}; - -static struct obbox_2 ob2[N*NT]; -static struct obbox_3 ob3[N]; - -static struct dbl_range dbl_range_expand(struct dbl_range b, double tol) -{ - double a = (b.min+b.max)/2, l = (b.max-b.min)*(1+tol)/2; - struct dbl_range m; - m.min = a-l, m.max = a+l; - return m; -} - -int main() -{ - int failure=0; - unsigned i; - - double *lob_bnd_data_r = tmalloc(double, - lob_bnd_size(NR,MR)+lob_bnd_size(NS,MS)+lob_bnd_size(NT,MT)), - *lob_bnd_data_s = lob_bnd_data_r + lob_bnd_size(NR,MR), - *lob_bnd_data_t = lob_bnd_data_s + lob_bnd_size(NS,MS); - - lobatto_nodes(zr,NR); lob_bnd_setup(lob_bnd_data_r,NR,MR); - lobatto_nodes(zs,NS); lob_bnd_setup(lob_bnd_data_s,NS,MS); - lobatto_nodes(zt,NT); lob_bnd_setup(lob_bnd_data_t,NT,MT); - - /* 2-D */ - for(i=0;ic0[0], dy=y_[j]-ob->c0[1]; - tx[0][j] = ob->A[0]*dx+ob->A[1]*dy; - tx[1][j] = 
ob->A[2]*dx+ob->A[3]*dy; - if( (x_[j]-ob->x[0].min)*(ob->x[0].max-x_[j]) < 0 - || (y_[j]-ob->x[1].min)*(ob->x[1].max-y_[j]) < 0 ) - failure=1, - printf("%d %d (%g,%g) not in [%g,%g] x [%g,%g]\n", n, j, - x_[j],y_[j], ob->x[0].min,ob->x[0].max, ob->x[1].min,ob->x[1].max); - if( (tx[0][j]+1)*(1-tx[0][j]) < 0 - || (tx[1][j]+1)*(1-tx[1][j]) < 0 ) - failure=1, - printf("%d %d (%g,%g) not in [-1,1]^2\n", n, j, - tx[0][j],tx[1][j]); - if(failure) break; - } - - xr = dbl_range_expand(lob_bnd_2(lob_bnd_data_r,NR,MR, - lob_bnd_data_s,NS,MS, x_, work), TOL); - yr = dbl_range_expand(lob_bnd_2(lob_bnd_data_r,NR,MR, - lob_bnd_data_s,NS,MS, y_, work), TOL); - - for(j=0;j<2;++j) tr[j] = dbl_range_expand( - lob_bnd_2(lob_bnd_data_r,NR,MR, lob_bnd_data_s,NS,MS, tx[j], work) - , TOL); - - if( ob->x[0].min < xr.min - DBL_EPSILON*128 - || ob->x[0].max > xr.max + DBL_EPSILON*128 ) failure = 1; - if( ob->x[1].min < yr.min - DBL_EPSILON*128 - || ob->x[1].max > yr.max + DBL_EPSILON*128 ) failure = 1; - - for(j=0;j<2;++j) - if( tr[j].min > -1 + DBL_EPSILON*128 - || tr[j].max < 1 - DBL_EPSILON*128 ) failure = 1; - - if((i==0&&n==0) || failure) { - printf("x: [%g,%g] in [%g,%g]\n", ob->x[0].min, ob->x[0].max, - xr.min, xr.max); - printf("y: [%g,%g] in [%g,%g]\n", ob->x[1].min, ob->x[1].max, - yr.min, yr.max); - for(j=0;j<2;++j) - printf("r %d: [%g,%g]\n", j, tr[j].min, tr[j].max); - } - if(failure) break; - } - if(failure) break; - printf("."); fflush(stdout); - } - printf("\n"); - - /* 3-D */ - for(i=0;!failure && ic0[0], dy=y_[j]-ob->c0[1], dz=z_[j]-ob->c0[2]; - tx[0][j] = ob->A[0]*dx+ob->A[1]*dy+ob->A[2]*dz; - tx[1][j] = ob->A[3]*dx+ob->A[4]*dy+ob->A[5]*dz; - tx[2][j] = ob->A[6]*dx+ob->A[7]*dy+ob->A[8]*dz; - if( (x_[j]-ob->x[0].min)*(ob->x[0].max-x_[j]) < 0 - || (y_[j]-ob->x[1].min)*(ob->x[1].max-y_[j]) < 0 - || (z_[j]-ob->x[2].min)*(ob->x[2].max-z_[j]) < 0 ) - failure=1, - printf("%d %d (%g,%g,%g) not in [%g,%g] x [%g,%g] x [%g,%g]\n", n, j, - x_[j],y_[j],z_[j], ob->x[0].min,ob->x[0].max, 
- ob->x[1].min,ob->x[1].max, ob->x[2].min,ob->x[2].max); - if( (tx[0][j]+1)*(1-tx[0][j]) < 0 - || (tx[1][j]+1)*(1-tx[1][j]) < 0 - || (tx[2][j]+1)*(1-tx[2][j]) < 0 ) - failure=1, - printf("%d %d (%g,%g,%g) not in [-1,1]^3\n", n, j, - tx[0][j],tx[1][j],tx[2][j]); - if(failure) break; - } - - xr = dbl_range_expand(lob_bnd_3(lob_bnd_data_r,NR,MR, - lob_bnd_data_s,NS,MS, - lob_bnd_data_t,NT,MT, x_, work), TOL); - yr = dbl_range_expand(lob_bnd_3(lob_bnd_data_r,NR,MR, - lob_bnd_data_s,NS,MS, - lob_bnd_data_t,NT,MT, y_, work), TOL); - zr = dbl_range_expand(lob_bnd_3(lob_bnd_data_r,NR,MR, - lob_bnd_data_s,NS,MS, - lob_bnd_data_t,NT,MT, z_, work), TOL); - - for(j=0;j<3;++j) tr[j] = dbl_range_expand( - lob_bnd_3(lob_bnd_data_r,NR,MR, lob_bnd_data_s,NS,MS, - lob_bnd_data_t,NT,MT, tx[j], work) - , TOL); - - if( ob->x[0].min < xr.min - DBL_EPSILON*128 - || ob->x[0].max > xr.max + DBL_EPSILON*128 ) failure = 1; - if( ob->x[1].min < yr.min - DBL_EPSILON*128 - || ob->x[1].max > yr.max + DBL_EPSILON*128 ) failure = 1; - if( ob->x[2].min < zr.min - DBL_EPSILON*128 - || ob->x[2].max > zr.max + DBL_EPSILON*128 ) failure = 1; - - for(j=0;j<3;++j) - if( tr[j].min > -1 + DBL_EPSILON*128 - || tr[j].max < 1 - DBL_EPSILON*128 ) failure = 1; - - if((i==0&&n==0) || failure) { - printf("x: [%g,%g] in [%g,%g]\n", ob->x[0].min, ob->x[0].max, - xr.min, xr.max); - printf("y: [%g,%g] in [%g,%g]\n", ob->x[1].min, ob->x[1].max, - yr.min, yr.max); - printf("z: [%g,%g] in [%g,%g]\n", ob->x[2].min, ob->x[2].max, - zr.min, zr.max); - for(j=0;j<3;++j) - printf("r %d: [%g,%g]\n", j, tr[j].min, tr[j].max); - } - if(failure) break; - } - if(failure) break; - printf("."); fflush(stdout); - } - printf("\n"); - - free(lob_bnd_data_r); - - printf("Tests %s\n", failure?"failed":"successful"); - - return failure; -} diff --git a/3rdParty/gslib/tests/poly_test.c b/3rdParty/gslib/tests/poly_test.c deleted file mode 100644 index b2caeef4a..000000000 --- a/3rdParty/gslib/tests/poly_test.c +++ /dev/null @@ -1,23 +0,0 @@ 
-#include -#include -#include "c99.h" -#include "name.h" -#include "types.h" -#include "poly.h" - -int main() -{ - int i, n=13; - double z[50], w[50]; - lobatto_quad(z,w,n); - /* - for(i=0;i> test_log - if [ "$?" -eq 0 ]; then - echo "Running test: $j, np: $n ... Passed." - else - echo "Running test: $j, np: $n ... Failed." - fi - done -done diff --git a/3rdParty/gslib/tests/sarray_sort_test.c b/3rdParty/gslib/tests/sarray_sort_test.c deleted file mode 100644 index 15fa780d9..000000000 --- a/3rdParty/gslib/tests/sarray_sort_test.c +++ /dev/null @@ -1,47 +0,0 @@ -#include -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "mem.h" -#include "sort.h" -#include "sarray_sort.h" - -int main() -{ - struct rec { double d; slong l; sint i; float f; }; - buffer buf = {0,0,0}; - struct rec rec[500]; - uint i; - - for(i=0;i<500;++i) { - sint num1 = rand() & 0xff; - slong num2 = rand(); - num2<<=(CHAR_BIT)*sizeof(int)-1; - num2|=rand(); - num2<<=(CHAR_BIT)*sizeof(int)-1; - num2|=rand(); - num2= num2<0?-num2:num2; - rec[i].d = num2; - rec[i].f = num2; - rec[i].l = num2; - rec[i].i = num1; - } - sarray_sort_2(struct rec,rec,500, i,0, l,1, &buf); - for(i=0;i<500;++i) - printf("%g\t%g\t%ld\t%d\n", - rec[i].d,rec[i].f,(long)rec[i].l,(int)rec[i].i); - - printf("\n"); - sarray_sort(struct rec,rec,500, l,1, &buf); - for(i=0;i<500;++i) - printf("%g\t%g\t%ld\t%d\n", - rec[i].d,rec[i].f,(long)rec[i].l,(int)rec[i].i); - buffer_free(&buf); - return 0; -} - diff --git a/3rdParty/gslib/tests/sarray_transfer_test.c b/3rdParty/gslib/tests/sarray_transfer_test.c deleted file mode 100644 index aaf3b7fd0..000000000 --- a/3rdParty/gslib/tests/sarray_transfer_test.c +++ /dev/null @@ -1,93 +0,0 @@ -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "comm.h" -#include "mem.h" -#include "sort.h" -#include "sarray_sort.h" -#include "crystal.h" -#include 
"sarray_transfer.h" - -typedef struct { - double d; - ulong l,l2; - uint i; - uint p; -} r_work; - -int main(int narg, char *arg[]) -{ - comm_ext world; int np; - struct comm comm; - struct crystal crystal; - struct array A, A0=null_array; r_work *row, *row_0; - uint i; -#ifdef MPI - MPI_Init(&narg,&arg); - world = MPI_COMM_WORLD; - MPI_Comm_size(world,&np); -#else - world=0, np=1; -#endif - - comm_init(&comm,world); - crystal_init(&crystal,&comm); - - array_init(r_work,&A,np*3), A.n=np*3, row=A.ptr; - for(i=0;i %02d: %08x %08x %d %g\n", - (int)comm.id,(int)row[i].p,(int)row[i].i, - (int)row[i].l,(int)row[i].p,row[i].d); - - array_cat(r_work,&A0, row,A.n); - - sarray_transfer(r_work,&A, p,1, &crystal); - - row=A.ptr; - for(i=0;i -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "mem.h" -#include "sort.h" - -#define SMALL 22 -#define NUM 500 -#define SI 9 - -ulong A[NUM][SI], Av[NUM]; -uint B[NUM][SI], Bv[NUM]; - -uint P[NUM], Q[NUM]; - -int main() -{ - buffer buf = {0,0,0}; - uint i; - - /*buffer_init(&buf, sortp_long_worksize(NUM,0));*/ - -#if 0 - printf("\nsource:\n"); -#endif - for(i=0;i!=NUM;++i) { - A[i][0]=rand(); - A[i][0]<<=CHAR_BIT*sizeof(int)-1; - A[i][0]^=rand(); - A[i][0]<<=CHAR_BIT*sizeof(int)-1; - A[i][0]^=rand(); - if(0) A[i][0]&=0x000ff00; - B[i][0]=A[i][0]; -#if 0 - printf("%016lx\t%016lx\n",(unsigned long)A[i][0],(unsigned long)B[i][0]); -#endif - } -#if 0 - printf("\n"); -#endif - printf("merge sort:\n"); - for(i=0;i!=SMALL;++i) Q[i]=SMALL-1-i; - sortv_long(Av, &A[0][0],SMALL,sizeof(ulong[SI]), &buf); - sortp_long(&buf,0, &A[0][0],SMALL,sizeof(ulong[SI])); - memcpy(P,buf.ptr,SMALL*sizeof(uint)); - memcpy(buf.ptr,Q,SMALL*sizeof(uint)); - sortp_long(&buf,1, &A[0][0],SMALL,sizeof(ulong[SI])); - memcpy(Q,buf.ptr,SMALL*sizeof(uint)); - for(i=0;i!=SMALL;++i) - printf("%u\t%u\t%016lx\t%d\t%d\n",(unsigned)P[i],(unsigned)Q[i], - (unsigned long)A[P[i]][0], - A[P[i]][0]==A[Q[i]][0], 
- Av[i]==A[P[i]][0]); - printf("\n"); - printf("radix sort:\n"); - for(i=0;i!=NUM;++i) Q[i]=NUM-1-i; - sortv_long(Av, &A[0][0],NUM,sizeof(ulong[SI]), &buf); - sortp_long(&buf,0, &A[0][0],NUM,sizeof(ulong[SI])); - memcpy(P,buf.ptr,NUM*sizeof(uint)); - memcpy(buf.ptr,Q,NUM*sizeof(uint)); - sortp_long(&buf,1, &A[0][0],NUM,sizeof(ulong[SI])); - memcpy(Q,buf.ptr,NUM*sizeof(uint)); - for(i=0;i!=NUM;++i) - printf("%u\t%u\t%016lx\t%d\t%d\n",(unsigned)P[i],(unsigned)Q[i], - (unsigned long)A[P[i]][0], - A[P[i]][0]==A[Q[i]][0], - Av[i]==A[P[i]][0]); - - printf("\nsmall integers:\n"); - printf("\n"); - - printf("heap sort:\n"); - for(i=0;i!=SMALL;++i) Q[i]=SMALL-1-i; - sortv(Q, Q,SMALL,sizeof(uint), &buf); - for(i=0;i!=SMALL;++i) printf("\t%u\n",(unsigned)Q[i]); - - printf("merge sort:\n"); - for(i=0;i!=SMALL;++i) Q[i]=SMALL-1-i; - sortv(Bv, &B[0][0],SMALL,sizeof(uint[SI]), &buf); - sortp(&buf,0, &B[0][0],SMALL,sizeof(uint[SI])); - memcpy(P,buf.ptr,SMALL*sizeof(uint)); - memcpy(buf.ptr,Q,SMALL*sizeof(uint)); - sortp(&buf,1, &B[0][0],SMALL,sizeof(uint[SI])); - memcpy(Q,buf.ptr,SMALL*sizeof(uint)); - for(i=0;i!=SMALL;++i) - printf("%u\t%u\t%016lx\t%d\t%d\n",(unsigned)P[i],(unsigned)Q[i], - (unsigned long)B[P[i]][0], - B[P[i]][0]==B[Q[i]][0], - B[P[i]][0]==Bv[i]); - printf("\n"); - printf("radix sort:\n"); - for(i=0;i!=NUM;++i) Q[i]=NUM-1-i; - sortv(Bv, &B[0][0],NUM,sizeof(uint[SI]), &buf); - sortp(&buf,0, &B[0][0],NUM,sizeof(uint[SI])); - memcpy(P,buf.ptr,NUM*sizeof(uint)); - memcpy(buf.ptr,Q,NUM*sizeof(uint)); - sortp(&buf,1, &B[0][0],NUM,sizeof(uint[SI])); - memcpy(Q,buf.ptr,NUM*sizeof(uint)); - for(i=0;i!=NUM;++i) - printf("%u\t%u\t%016lx\t%d\t%d\n",(unsigned)P[i],(unsigned)Q[i], - (unsigned long)B[P[i]][0], - B[P[i]][0]==B[Q[i]][0], - B[P[i]][0]==Bv[i]); - buffer_free(&buf); - return 0; -} - diff --git a/3rdParty/gslib/tests/sort_test2.c b/3rdParty/gslib/tests/sort_test2.c deleted file mode 100644 index d3ed601bb..000000000 --- a/3rdParty/gslib/tests/sort_test2.c +++ 
/dev/null @@ -1,58 +0,0 @@ -#include -#include -#include -#include -#include -#include "c99.h" -#include "name.h" -#include "fail.h" -#include "types.h" -#include "mem.h" -#include "sort.h" - -#if 1 - -#define N (1<<20) - -ulong A[N], out[N]; -uint P[N]; - -int main() -{ - buffer buf = null_buffer; - uint i; - unsigned long long tic, toc; - unsigned r; - - for(i=0;i!=N;++i) { - A[i]=rand(); - A[i]<<=CHAR_BIT*sizeof(int)-1; - A[i]^=rand(); - A[i]<<=CHAR_BIT*sizeof(int)-1; - A[i]^=rand(); - if(0) A[i]&=0x000ff00; - } - - for(i=N;i;i>>=1) { - unsigned long long t; - sortv_long(out, A,i,sizeof(ulong), &buf); - } - - for(i=N;i;i>>=1) { - unsigned long long t; - sortp_long(&buf,0, A,i,sizeof(ulong)); - } - - buffer_free(&buf); - return 0; -} - -#else - -int main() -{ - return 0; -} - -#endif - diff --git a/LICENSE b/LICENSE index 87572be49..135c3fa96 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (c) 2017-2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index b57f12847..796a2c49e 100644 --- a/README.md +++ b/README.md @@ -16,11 +16,10 @@ If you use any part of libParanumal in your research project including variants @MISC{ChalmersKarakusAustinSwirydowiczWarburton2020, author = "Chalmers, N. and Karakus, A. and Austin, A. P. and Swirydowicz, K. and Warburton, T.", title = "{libParanumal}: a performance portable high-order finite element library", - year = "2020", + year = "2022", url = "https://github.com/paranumal/libparanumal", doi = "10.5281/zenodo.4004744", - note = "Release 0.4.0" - } + note = "Release 0.5.0"} see the [references](#10-references) section below for additional papers to reference about various aspects of the library. @@ -42,7 +41,9 @@ A. 
Supported elements: B. Mesh wrangling: - Gmsh format file loaders. - - Load balanced geometric partitioning using space filling curves (Hilbert or Morton ordering). + - Load balanced inertial partitioning. + - Load balanced multi-level spectral partitioning. + - Cuthill-McKee local ordering. C. Time integrators: - Adaptive rate Dormand-Prince order 5 Runge-Kutta. @@ -59,7 +60,7 @@ D. Iterative linear solvers: E. Elliptic solver: - Linear Poisson and screened Poisson potential solvers. - GPU-optimized matrix-vector products. - - p-type multigrid, algebraic multigrid, low-order SEMFEM, Overlapping Additive Schwarz, and Jacobi preconditioning. + - p-type multigrid, algebraic multigrid (smoothed and unsmoothed aggregation), low-order SEMFEM, Overlapping Additive Schwarz, and Jacobi preconditioning. - Matrix-free p-multigrid for fine levels of multigrid hierarchy. F. Heterogeneous accelerated flow solvers: @@ -75,45 +76,51 @@ F. Heterogeneous accelerated flow solvers: * Extrapolation-BDF integration in time. * Sub-cycling (Operator Integration Factor Splitting) for advection. -G. Dependencies: +G. Portability: + - Ships with the Open Concurrent Compute Abstraction (OCCA) + - At build time, OCCA will try to detect if any of these execution models are installed: OpenMP, CUDA, OpenCL, HIP, and/or SYCL. + - Execution model can be selected at runtime. + - If OCCA does not detect a chosen mode of execution it will default to Serial execution. + - You will need to adjust the libParanumal setup input files to choose the execution model and compute device appropriate for your system. + +H. Dependencies: - Message Passing Interface (MPI v3.0 or higher). * The libParanumal makefiles assume that mpic++ is installed and visible in your path. - - Open Concurrent Compute Abstraction (OCCA) - * OCCA must be installed. - * OCCA will try to detect if any of these execution models are installed: OpenMP, CUDA, OpenCL, and/or HIP.
- * By default, if OCCA does not detect a chosen mode of execution it will default to Serial execution. - * You will need to adjust the libParnumal setup input files to choose the execution model and compute device appropriate for your system. - * The OCCA github repo is [here](https://github.com/libocca/occa) - * The OCCA webpage is [here](http://libocca.org) + --- ### 4. Code block diagram - +![libParanumal Code Diagram](./.github/CodeDiagram.png) --- ### 5. OCCA dependency -OCCA is held as a git submodule inside libParanumal. If you did not clone with `--recursive` then run the following command before building. -`git submodule init` -`git submodule update` +OCCA is held as a git submodule inside libParanumal. If you did not clone with `--recursive` then run the following commands before building. +``` +git submodule init +git submodule update +``` --- ### 6. Required Libraries -libParanumal requires installed BLAS and LAPACK libraries. By default, the build system will look for `libblas` and `liblapack` in your default library search paths. The library paths can also be manually specified in `make.top` with the `LIBP_BLAS_DIR` and `LIBP_LAPACK_DIR` variables. +libParanumal requires installed BLAS and LAPACK libraries. By default, the build system will look for a serial (i.e. non-threaded) OpenBLAS in your default library search paths. The BLAS and LAPACK library paths can also be manually specified in `make.top` with the `LIBP_BLAS_DIR` and `LIBP_BLAS_LIB` variables. -Some Linux distributions will package BLAS and LAPACK libraries. For example, on Ubuntu systems these libraries can be installed via -```sudo apt install libblas-dev liblapack-dev``` - - -libParanumal also depends on the [gslib](https://github.com/Nek5000/gslib) library for gather-scatter operations.
For more information on gslib see [Henry Tufo's thesis](https://dl.acm.org/doi/book/10.5555/926758) and a more recent reference [Fischer et al.](https://iopscience.iop.org/article/10.1088/1742-6596/125/1/012076/meta). The source code for gslib is included in this repository. +Some Linux distributions will package a serial OpenBLAS library. For example, on Ubuntu systems this library can be installed via +``` +sudo apt install libopenblas-serial-dev +``` --- ### 7. Clone: libParanumal -`git clone https://github.com/paranumal/libparanumal` +``` +git clone --recursive https://github.com/paranumal/libparanumal +``` #### 7-1. Build all libParanumal solvers -`cd libparanumal` -```make -j `nproc` ``` +``` +cd libparanumal +make -j `nproc` +``` --- ### 8. Running the codes: @@ -122,16 +129,30 @@ Each solver resides in its respective sub-directory in `solvers/`. Each solver s #### 8-1. Build libParanumal elliptic solver -`cd libparanumal/solvers/elliptic` -```make -j `nproc` ``` +``` +cd libparanumal/solvers/elliptic +make -j `nproc` +``` #### 8-2. Run elliptic example with provided quadrilateral set up file on a single device: -`./ellipticMain setups/setupQuad2D.rc` +libParanumal will make use of extra CPU cores if available. It is therefore beneficial to bind the MPI process to several CPU cores, if possible. For example, running the libParanumal elliptic solver with OpenMPI on a system with 16 CPU cores can be done via + +``` +mpiexec -np 1 --map-by slot:PE=16 ./ellipticMain setups/setupQuad2D.rc +``` + +The number of CPU cores used can also be controlled with the `OMP_NUM_THREADS` environment variable. libParanumal will not use more threads than there are physical CPU cores on the system, however, even in the presence of this environment variable. #### 8-3.
Run the same example with four devices: -`mpiexec -n 4 ./ellipticMain setups/setupQuad2D.rc` +As the number of MPI processes per system increases, it is advisable to reduce the number of CPU cores per process to avoid oversubscribing the CPU cores. Using the same example above of the libParanumal elliptic solver with OpenMPI on a system with 16 CPU cores, a four rank run could be done via + +``` +mpiexec -np 4 --map-by slot:PE=4 ./ellipticMain setups/setupQuad2D.rc +``` + +i.e. each process binds to four of the 16 CPU cores available. --- @@ -139,7 +160,7 @@ Each solver resides in its respective sub-directory in `solvers/`. Each solver s The MIT License (MIT) -Copyright (c) 2017-2021 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -182,6 +203,10 @@ Low-order preconditioning of triangular elements (elliptic precon): [publisher]( ### 11. Technical Reports +CEED MS38: [link](https://doi.org/10.5281/zenodo.6475857): `Kolev, T., Fischer, P., Abdelfattah, A., Beams, N., Brown, J., Camier, J.-S., Carson, R., Chalmers, N., Dobrev, V., Dudouit, Y., Ghaffari, L., Joshi, A. Y., Kerkemeier, S., Lan, Y.-H., McDougall, D., Medina, D., Min, M., Mishra, A., Pazner, W., Phillips, M., Ratnayaka, T., Shephard, M. S., Siboni, M. H., Smith, C. W., Thompson, J. L., Tomboulides, A., Tomov, S., Tomov, V., Warburton, T., 2022. 
ECP Milestone Report: High-order algorithmic developments and optimizations for more robust exascale applications, WBS 2.2.6.06, Milestone CEED-MS38.` + +CEED MS37: [link](https://doi.org/10.5281/zenodo.5542244): `Kolev, T., Fischer, P., Beams, N., Brown, J., Camier, J.-S., Chalmers, N., Dobrev, V., Dudouit, Y., Kerkemeier, S., Lan, Y.-H., Lin, Y., Lindquist, N., McDougall, D., Medina, D., Merzari, E., Min, M., Moe, S., Pazner, W., Phillips, M., Ratnayaka, T., Rowe, K., Shephard, M. S., Smith, C. W., Tomov, S., Warburton, T., 2022. CEED ECP Milestone Report: Port and optimize the CEED software stack to Aurora / Frontier EA Systems, WBS 2.2.6.06, Milestone CEED-MS37.` + CEED MS36: [link](https://doi.org/10.5281/zenodo.4672664): `Kolev, T., Fischer, P., Austin, A.P., Barker, A.T., Beams, N., Brown, J., Camier, J.-S., Chalmers, N., Dobrev, V., Dudouit, Y., Ghaffari, L., Kerkemeier, S., Lan, Y.-H., Merzari, E., Min, M., Pazner, W., Ratnayaka, T., Shephard, M. S., Siboni, M.H., Smith, C.W., Thompson, J.L., Tomov, S., Warburton, T., 2021. ECP Milestone Report: High-order algorithmic developments and optimizations for large-scale GPU-accelerated simulations, WBS 2.2.6.06, Milestone CEED-MS36.` CEED MS35: [link](https://doi.org/10.5281/zenodo.4146400): `Kolev, T., Fischer, P., Abdelfattah, A., Barra, V., Beams, N., Brown, J., Camier, J.S., Chalmers, N., Dobrev, V., Kerkemeier, S., Lan, Y.H., Merzari, E., Min, M., Phillips, M., Ratnayaka, T., Rowe, K., Thompson, J., Tomboulides, A., Tomov, S., Tomov, V,. and Warburton, T., 2020. 
ECP Milestone Report: Support CEED-enabled ECP applications in their preparation for Aurora/Frontier, WBS 2.2.6.06, Milestone CEED-MS35.` diff --git a/include/comm.hpp b/include/comm.hpp new file mode 100644 index 000000000..4cd03dddc --- /dev/null +++ b/include/comm.hpp @@ -0,0 +1,565 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +#ifndef LIBP_COMM_HPP +#define LIBP_COMM_HPP + +#include +#include "core.hpp" + +namespace libp { + +#define MAX_PROCESSOR_NAME MPI_MAX_PROCESSOR_NAME + +/*Generic data type*/ +template +struct mpiType { + static MPI_Datatype getMpiType() { + MPI_Datatype type; + MPI_Type_contiguous(sizeof(T), MPI_CHAR, &type); + MPI_Type_commit(&type); + return type; + } + static void freeMpiType(MPI_Datatype type) { + MPI_Type_free(&type); + } + static constexpr bool isMpiType() { return false; } +}; + +/*Pre-defined MPI datatypes*/ +#define TYPE(T, MPI_T) \ +template<> struct mpiType { \ + static MPI_Datatype getMpiType() { return MPI_T; } \ + static void freeMpiType(MPI_Datatype type) { } \ + static constexpr bool isMpiType() { return true; } \ +} + +TYPE(char, MPI_CHAR); +TYPE(int, MPI_INT); +TYPE(long long int, MPI_LONG_LONG_INT); +TYPE(float, MPI_FLOAT); +TYPE(double, MPI_DOUBLE); +#undef TYPE + +class comm_t; + +namespace Comm { + + using request_t = MPI_Request; + + /*Predefined ops*/ + using op_t = MPI_Op; + static constexpr op_t Max = MPI_MAX; + static constexpr op_t Min = MPI_MIN; + static constexpr op_t Sum = MPI_SUM; + static constexpr op_t Prod = MPI_PROD; + static constexpr op_t And = MPI_LAND; + static constexpr op_t Or = MPI_LOR; + static constexpr op_t Xor = MPI_LXOR; + + /*MPI_Init and MPI_Finalize*/ + void Init(int &argc, char** &argv); + void Finalize(); + + /*handle to MPI_COMM_WORLD*/ + comm_t World(); + + void GetProcessorName(char* name, int &namelen); + +} //namespace Comm + +/*Communicator class*/ +class comm_t { + + private: + std::shared_ptr comm_ptr; + int _rank=0; + int _size=0; + + public: + comm_t() = default; + comm_t(const comm_t &c) = default; + comm_t& operator = (const comm_t &c)=default; + + /*MPI_Comm_dup and MPI_Comm_delete*/ + comm_t Dup() const; + comm_t Split(const int color, const int key) const; + void Free(); + + /*Rank and size getters*/ + const int rank() const; + const int size() const; + + /*MPI_Comm getter*/ + MPI_Comm 
comm() const; + + /*libp::memory send*/ + template class mem, typename T> + void Send(mem m, + const int dest, + const int count=-1, + const int tag=0) const { + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (count==-1) ? static_cast(m.length()) : count; + MPI_Send(m.ptr(), cnt, type, dest, tag, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory recv*/ + template class mem, typename T> + void Recv(mem m, + const int source, + const int count=-1, + const int tag=0) const { + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (count==-1) ? static_cast(m.length()) : count; + MPI_Recv(m.ptr(), cnt, type, source, tag, comm()); + mpiType::freeMpiType(type); + } + + /*scalar send*/ + template + void Send(T& val, + const int dest, + const int tag=0) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Send(&val, 1, type, dest, tag, comm()); + mpiType::freeMpiType(type); + } + + /*scalar recv*/ + template + void Recv(T& val, + const int source, + const int tag=0) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Recv(&val, 1, type, source, tag, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory non-blocking send*/ + template class mem, typename T> + void Isend(mem m, + const int dest, + const int count, + const int tag, + Comm::request_t &request) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Isend(m.ptr(), count, type, dest, tag, comm(), &request); + mpiType::freeMpiType(type); + } + + /*libp::memory non-blocking recv*/ + template class mem, typename T> + void Irecv(mem m, + const int source, + const int count, + const int tag, + Comm::request_t &request) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Irecv(m.ptr(), count, type, source, tag, comm(), &request); + mpiType::freeMpiType(type); + } + + /*scalar non-blocking send*/ + template + void Isend(T& val, + const int dest, + const int tag, + Comm::request_t &request) const { + MPI_Datatype type = mpiType::getMpiType(); + 
MPI_Isend(&val, 1, type, dest, tag, comm(), &request); + mpiType::freeMpiType(type); + } + + /*scalar non-blocking recv*/ + template + void Irecv(T& val, + const int source, + const int tag, + Comm::request_t &request) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Irecv(&val, 1, type, source, tag, comm(), &request); + mpiType::freeMpiType(type); + } + + /*libp::memory broadcast*/ + template class mem, typename T> + void Bcast(mem m, + const int root, + const int count=-1) const { + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (count==-1) ? static_cast(m.length()) : count; + MPI_Bcast(m.ptr(), cnt, type, root, comm()); + mpiType::freeMpiType(type); + } + + /*scalar broadcast*/ + template + void Bcast(T& val, + const int root) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Bcast(&val, 1, type, root, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory reduce*/ + template class mem, typename T> + void Reduce(const mem snd, + mem rcv, + const int root, + const Comm::op_t op = Comm::Sum, + const int count=-1) const { + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (count==-1) ? static_cast(snd.length()) : count; + MPI_Reduce(snd.ptr(), rcv.ptr(), cnt, type, op, root, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory in-place reduce*/ + template class mem, typename T> + void Reduce(mem m, + const int root, + const Comm::op_t op = Comm::Sum, + const int count=-1) const { + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (count==-1) ? 
static_cast(m.length()) : count; + if (_rank==root) { + MPI_Reduce(MPI_IN_PLACE, m.ptr(), cnt, type, op, root, comm()); + } else { + MPI_Reduce(m.ptr(), nullptr, cnt, type, op, root, comm()); + } + mpiType::freeMpiType(type); + } + + /*scalar reduce*/ + template + void Reduce(const T& snd, + T& rcv, + const int root, + const Comm::op_t op = Comm::Sum) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Reduce(&snd, &rcv, 1, type, op, root, comm()); + mpiType::freeMpiType(type); + } + template + void Reduce(T& val, + const int root, + const Comm::op_t op = Comm::Sum) const { + T rcv=val; + Reduce(val, rcv, root, op); + if (rank()==root) val=rcv; + } + + /*libp::memory allreduce*/ + template class mem, typename T> + void Allreduce(const mem snd, + mem rcv, + const Comm::op_t op = Comm::Sum, + const int count=-1) const { + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (count==-1) ? static_cast(snd.length()) : count; + MPI_Allreduce(snd.ptr(), rcv.ptr(), cnt, type, op, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory in-place allreduce*/ + template class mem, typename T> + void Allreduce(mem m, + const Comm::op_t op = Comm::Sum, + const int count=-1) const { + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (count==-1) ? 
static_cast(m.length()) : count; + MPI_Allreduce(MPI_IN_PLACE, m.ptr(), cnt, type, op, comm()); + mpiType::freeMpiType(type); + } + + /*scalar allreduce*/ + template + void Allreduce(const T& snd, + T& rcv, + const Comm::op_t op = Comm::Sum) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Allreduce(&snd, &rcv, 1, type, op, comm()); + mpiType::freeMpiType(type); + } + template + void Allreduce(T& val, + const Comm::op_t op = Comm::Sum) const { + T rcv=val; + Allreduce(val, rcv, op); + val = rcv; + } + + /*libp::memory non-blocking allreduce*/ + template class mem, typename T> + void Iallreduce(const mem snd, + mem rcv, + const Comm::op_t op, + const int count, + Comm::request_t &request) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Iallreduce(snd.ptr(), rcv.ptr(), count, type, op, comm(), &request); + mpiType::freeMpiType(type); + } + + /*libp::memory non-blocking in-place allreduce*/ + template class mem, typename T> + void Iallreduce(mem m, + const Comm::op_t op, + const int count, + Comm::request_t &request) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Iallreduce(MPI_IN_PLACE, m.ptr(), count, type, op, comm(), &request); + mpiType::freeMpiType(type); + } + + /*scalar non-blocking allreduce*/ + template class mem, typename T> + void Iallreduce(const T& snd, + T& rcv, + const Comm::op_t op, + Comm::request_t &request) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Iallreduce(&snd, &rcv, 1, type, op, comm(), &request); + mpiType::freeMpiType(type); + } + /*scalar non-blocking in-place allreduce*/ + template class mem, typename T> + void Iallreduce(T& val, + const Comm::op_t op, + Comm::request_t &request) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Iallreduce(MPI_IN_PLACE, &val, 1, type, op, comm(), &request); + mpiType::freeMpiType(type); + } + + /*libp::memory scan*/ + template class mem, typename T> + void Scan(const mem snd, + mem rcv, + const Comm::op_t op = Comm::Sum, + const int count=-1) const 
{ + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (count==-1) ? static_cast(snd.length()) : count; + MPI_Scan(snd.ptr(), rcv.ptr(), cnt, type, op, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory in-place scan*/ + template class mem, typename T> + void Scan(mem m, + const Comm::op_t op = Comm::Sum, + const int count=-1) const { + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (count==-1) ? static_cast(m.length()) : count; + MPI_Scan(MPI_IN_PLACE, m.ptr(), cnt, type, op, comm()); + mpiType::freeMpiType(type); + } + + /*scalar scan*/ + template + void Scan(const T& snd, + T& rcv, + const Comm::op_t op = Comm::Sum) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Scan(&snd, &rcv, 1, type, op, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory gather*/ + template class mem, typename T> + void Gather(const mem snd, + mem rcv, + const int root, + const int sendCount=-1) const { + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (sendCount==-1) ? 
static_cast(snd.length()) : sendCount; + MPI_Gather(snd.ptr(), cnt, type, + rcv.ptr(), cnt, type, root, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory gatherv*/ + template class mem, typename T> + void Gatherv(const mem snd, + const int sendcount, + mem rcv, + const memory recvCounts, + const memory recvOffsets, + const int root) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Gatherv(snd.ptr(), sendcount, type, + rcv.ptr(), recvCounts.ptr(), recvOffsets.ptr(), type, + root, comm()); + mpiType::freeMpiType(type); + } + + /*scalar gather*/ + template class mem, typename T> + void Gather(const T& snd, + mem rcv, + const int root) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Gather(&snd, 1, type, + rcv.ptr(), 1, type, root, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory scatter*/ + template class mem, typename T> + void Scatter(const mem snd, + mem rcv, + const int root, + const int count=-1) const { + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (count==-1) ? 
static_cast(rcv.length()) : count; + MPI_Scatter(snd.ptr(), cnt, type, + rcv.ptr(), cnt, type, root, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory scatterv*/ + template class mem, typename T> + void Scatterv(const mem snd, + const memory sendCounts, + const memory sendOffsets, + mem rcv, + const int recvcount, + const int root) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Scatterv(snd.ptr(), sendCounts.ptr(), sendOffsets.ptr(), type, + rcv.ptr(), recvcount, type, + root, comm()); + mpiType::freeMpiType(type); + } + + /*scalar scatter*/ + template class mem, typename T> + void Scatter(T& rcv, + const mem snd, + const int root) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Scatter(snd.ptr, 1, type, + &rcv, 1, type, root, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory allgather*/ + template class mem, typename T> + void Allgather(const mem snd, + mem rcv, + const int sendCount=-1) const { + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (sendCount==-1) ? 
static_cast(snd.length()) : sendCount; + MPI_Allgather(snd.ptr(), cnt, type, + rcv.ptr(), cnt, type, comm()); + mpiType::freeMpiType(type); + } + template class mem, typename T> + void Allgather(mem m, + const int cnt) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Allgather(MPI_IN_PLACE, cnt, type, + m.ptr(), cnt, type, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory allgatherv*/ + template class mem, typename T> + void Allgatherv(const mem snd, + const int sendcount, + mem rcv, + const memory recvCounts, + const memory recvOffsets) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Allgatherv(snd.ptr(), sendcount, type, + rcv.ptr(), recvCounts.ptr(), recvOffsets.ptr(), type, + comm()); + mpiType::freeMpiType(type); + } + + /*scalar allgather*/ + template class mem, typename T> + void Allgather(const T& snd, + mem rcv) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Allgather(&snd, 1, type, + rcv.ptr(), 1, type, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory alltoall*/ + template class mem, typename T> + void Alltoall(const mem snd, + mem rcv, + const int cnt=1) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Alltoall(snd.ptr(), cnt, type, + rcv.ptr(), cnt, type, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory alltoallv*/ + template class mem, typename T> + void Alltoallv(const mem snd, + const memory sendCounts, + const memory sendOffsets, + mem rcv, + const memory recvCounts, + const memory recvOffsets) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Alltoallv(snd.ptr(), sendCounts.ptr(), sendOffsets.ptr(), type, + rcv.ptr(), recvCounts.ptr(), recvOffsets.ptr(), type, + comm()); + mpiType::freeMpiType(type); + } + + template class mem, typename T> + void Ialltoallv(const mem snd, + const memory sendCounts, + const memory sendOffsets, + mem rcv, + const memory recvCounts, + const memory recvOffsets, + Comm::request_t &request) const { + MPI_Datatype type = 
mpiType::getMpiType(); + MPI_Ialltoallv(snd.ptr(), sendCounts.ptr(), sendOffsets.ptr(), type, + rcv.ptr(), recvCounts.ptr(), recvOffsets.ptr(), type, + comm(), &request); + mpiType::freeMpiType(type); + } + + void Wait(Comm::request_t &request) const; + void Waitall(const int count, memory &requests) const; + void Barrier() const; + + friend comm_t Comm::World(); +}; + +} //namespace libp + +#endif diff --git a/include/core.hpp b/include/core.hpp index 0319e9d87..6cbb04d8c 100644 --- a/include/core.hpp +++ b/include/core.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,58 +27,28 @@ SOFTWARE. #ifndef CORE_HPP #define CORE_HPP -#include -#include -#include -#include -#include -#include #include "utils.hpp" +#include "memory.hpp" +#include "comm.hpp" -// sort entries in an array in parallel -void parallelSort(int size, int rank, MPI_Comm comm, - int N, void *vv, size_t sz, - int (*compare)(const void *, const void *), - void (*match)(void *, void *) - ); +namespace libp { // find a factorization n = nx*ny such that // nx>=ny are 'close' to one another -void factor2(const int n, int &nx, int &ny); +void Factor2(const int n, int &nx, int &ny); + +void RankDecomp2(int size_x, int size_y, + int &rank_x, int &rank_y, + const int rank); // find a factorization n = nx*ny*nz such that // nx>=ny>=nz are all 'close' to one another -void factor3(const int n, int &nx, int &ny, int &nz); - -void matrixRightSolve(int NrowsA, int NcolsA, double *A, int NrowsB, int NcolsB, double *B, double *C); -void matrixRightSolve(int NrowsA, int NcolsA, float *A, int NrowsB, int NcolsB, float *B, float *C); -void matrixUnderdeterminedRightSolveMinNorm(int NrowsA, int NcolsA, dfloat *A, dfloat 
*b, dfloat *x); -void matrixUnderdeterminedRightSolveCPQR(int NrowsA, int NcolsA, dfloat *A, dfloat *b, dfloat *x); - -void matrixEigenVectors(int N, double *A, double *VR, double *WR, double *WI); -void matrixEigenVectors(int N, float *A, float *VR, float *WR, float *WI); - -void matrixEigenValues(int N, double *A, double *WR, double *WI); -void matrixEigenValues(int N, float *A, float *WR, float *WI); - -void matrixInverse(int N, double *A); -void matrixInverse(int N, float *A); - -double matrixConditionNumber(int N, double *A); -float matrixConditionNumber(int N, float *A); +void Factor3(const int n, int &nx, int &ny, int &nz); -void matrixTranspose(const int M, const int N, - const double *A, const int LDA, - double *AT, const int LDAT); -void matrixTranspose(const int M, const int N, - const float *A, const int LDA, - float *AT, const int LDAT); +void RankDecomp3(int size_x, int size_y, int size_z, + int &rank_x, int &rank_y, int &rank_z, + const int rank); -void matrixTranspose(const int M, const int N, - const int *A, const int LDA, - int *AT, const int LDAT); -void matrixTranspose(const int M, const int N, - const long long int *A, const int LDA, - long long int *AT, const int LDAT); +} //namespace libp #endif diff --git a/include/initialGuess.hpp b/include/initialGuess.hpp index c30539018..5050ba541 100644 --- a/include/initialGuess.hpp +++ b/include/initialGuess.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Anthony Austin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,146 +27,136 @@ SOFTWARE. 
#ifndef INITIALGUESS_HPP #define INITIALGUESS_HPP -#include "linearSolver.hpp" +#include "core.hpp" +#include "platform.hpp" +#include "solver.hpp" + +namespace libp { + +namespace InitialGuess { + + +void AddSettings(settings_t& settings, const std::string prefix = ""); // Abstract base class for different initial guess strategies. class initialGuessStrategy_t { -protected: - platform_t& platform; - settings_t& settings; - MPI_Comm comm; + protected: + platform_t platform; + settings_t settings; + comm_t comm; dlong Ntotal; // Degrees of freedom -public: - initialGuessStrategy_t(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm); - virtual ~initialGuessStrategy_t(); + public: + initialGuessStrategy_t(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm): + platform(_platform), settings(_settings), comm(_comm), Ntotal(_N) {} - virtual void FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs) = 0; - virtual void Update(solver_t& solver, occa::memory& o_x, occa::memory& o_rhs) = 0; + virtual void FormInitialGuess(deviceMemory& o_x, deviceMemory& o_rhs) = 0; + virtual void Update(operator_t& linearOperator, deviceMemory& o_x, deviceMemory& o_rhs) = 0; }; // Default initial guess strategy: use whatever the user gave us. -class igDefaultStrategy : public initialGuessStrategy_t { +class Default : public initialGuessStrategy_t { public: - igDefaultStrategy(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm); + Default(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm); - void FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs); - void Update(solver_t &solver, occa::memory& o_x, occa::memory& o_rhs); + void FormInitialGuess(deviceMemory& o_x, deviceMemory& o_rhs); + void Update(operator_t &linearOperator, deviceMemory& o_x, deviceMemory& o_rhs); }; // Zero initial guess strategy: use a zero initial guess. 
-class igZeroStrategy : public initialGuessStrategy_t { +class Zero : public initialGuessStrategy_t { public: - igZeroStrategy(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm); + Zero(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm); - void FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs); - void Update(solver_t &solver, occa::memory& o_x, occa::memory& o_rhs); + void FormInitialGuess(deviceMemory& o_x, deviceMemory& o_rhs); + void Update(operator_t &linearOperator, deviceMemory& o_x, deviceMemory& o_rhs); }; // Initial guess strategies based on RHS projection. -class igProjectionStrategy : public initialGuessStrategy_t { +class Projection : public initialGuessStrategy_t { protected: dlong curDim; // Current dimension of the initial guess space dlong maxDim; // Maximum dimension of the initial guess space - occa::memory o_btilde; // vector (e.g., to be added to space) - occa::memory o_xtilde; // Solution vector corresponding to o_btilde - occa::memory o_Btilde; // space (orthogonalized) - occa::memory o_Xtilde; // Solution space corresponding to space + deviceMemory o_btilde; // vector (e.g., to be added to space) + deviceMemory o_xtilde; // Solution vector corresponding to o_btilde + deviceMemory o_Btilde; // space (orthogonalized) + deviceMemory o_Xtilde; // Solution space corresponding to space // temporary buffer for basis inner product output dlong ctmpNblocks; - dfloat *ctmp; - occa::memory o_ctmp; + pinnedMemory ctmp; + deviceMemory o_ctmp; - dfloat *alphas; // Buffers for storing inner products. - dfloat *alphasThisRank; - occa::memory o_alphas; + pinnedMemory alphas; // Buffers for storing inner products. 
+ deviceMemory o_alphas; - occa::kernel igBasisInnerProductsKernel; - occa::kernel igReconstructKernel; - occa::kernel igScaleKernel; - occa::kernel igUpdateKernel; + kernel_t igBasisInnerProductsKernel; + kernel_t igReconstructKernel; + kernel_t igScaleKernel; + kernel_t igUpdateKernel; - void igBasisInnerProducts(occa::memory& o_x, occa::memory& o_Q, occa::memory& o_c, dfloat *c, dfloat *cThisRank); - void igReconstruct(occa::memory& o_u, dfloat a, occa::memory& o_c, occa::memory& o_Q, occa::memory& o_unew); + void igBasisInnerProducts(deviceMemory& o_x, deviceMemory& o_Q, deviceMemory& o_c, pinnedMemory& c); + void igReconstruct(deviceMemory& o_u, dfloat a, deviceMemory& o_c, deviceMemory& o_Q, deviceMemory& o_unew); public: - igProjectionStrategy(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm); - virtual ~igProjectionStrategy(); + Projection(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm); - virtual void FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs); - virtual void Update(solver_t& solver, occa::memory& o_x, occa::memory& o_rhs) = 0; + virtual void FormInitialGuess(deviceMemory& o_x, deviceMemory& o_rhs); + virtual void Update(operator_t& linearOperator, deviceMemory& o_x, deviceMemory& o_rhs) = 0; }; // "Classic" initial guess strategy from Fischer's 1998 paper. -class igClassicProjectionStrategy : public igProjectionStrategy { +class ClassicProjection : public Projection { public: - igClassicProjectionStrategy(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm); + ClassicProjection(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm); - void Update(solver_t &solver, occa::memory& o_x, occa::memory& o_rhs); + void Update(operator_t &linearOperator, deviceMemory& o_x, deviceMemory& o_rhs); }; // Rolling QR update for projection history space a la Christensen's thesis. 
-class igRollingQRProjectionStrategy : public igProjectionStrategy { +class RollingQRProjection : public Projection { private: - dfloat *R; // R factor in QR decomposition (row major) - occa::memory o_R; + pinnedMemory R; // R factor in QR decomposition (row major) + deviceMemory o_R; - occa::kernel igDropQRFirstColumnKernel; + kernel_t igDropQRFirstColumnKernel; - void givensRotation(dfloat a, dfloat b, dfloat *c, dfloat *s); + void givensRotation(dfloat a, dfloat b, dfloat& c, dfloat& s); public: - igRollingQRProjectionStrategy(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm); - ~igRollingQRProjectionStrategy(); + RollingQRProjection(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm); - void Update(solver_t &solver, occa::memory& o_x, occa::memory& o_rhs); + void Update(operator_t &linearOperator, deviceMemory& o_x, deviceMemory& o_rhs); }; // Extrapolation initial guess strategy. -class igExtrapStrategy : public initialGuessStrategy_t { +class Extrap : public initialGuessStrategy_t { private: int Nhistory; int shift; int entry; - occa::memory o_xh; - occa::memory o_coeffs; - occa::kernel igExtrapKernel; - occa::kernel igExtrapSparseKernel; + deviceMemory o_xh; + deviceMemory o_coeffs; + kernel_t igExtrapKernel; + kernel_t igExtrapSparseKernel; int Nsparse; - occa::memory o_sparseIds; - occa::memory o_sparseCoeffs; + deviceMemory o_sparseIds; + deviceMemory o_sparseCoeffs; - void extrapCoeffs(int m, int M, dfloat *c); + void extrapCoeffs(int m, int M, memory c); public: - igExtrapStrategy(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm); + Extrap(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm); - void FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs); - void Update(solver_t &solver, occa::memory& o_x, occa::memory& o_rhs); + void FormInitialGuess(deviceMemory& o_x, deviceMemory& o_rhs); + void Update(operator_t &linearOperator, deviceMemory& o_x, deviceMemory& o_rhs); }; 
-// Linear solver with successive-RHS initial-guess generation. -class initialGuessSolver_t : public linearSolver_t { -protected: - initialGuessStrategy_t* igStrategy; // The initial guess strategy. - linearSolver_t* linearSolver; // The linearSolver_t that does the solve. - -public: - initialGuessSolver_t(dlong _N, dlong _Nhalo, platform_t& _platform, settings_t& _settings, MPI_Comm _comm); - ~initialGuessSolver_t(); - - static initialGuessSolver_t* Setup(dlong _N, dlong _Nhalo, - platform_t& platform, settings_t& settings, MPI_Comm _comm); - - int Solve(solver_t& solver, precon_t& precon, - occa::memory& o_x, occa::memory& o_rhs, - const dfloat tol, const int MAXIT, const int verbose); -}; +} //namespace InitialGuess -void initialGuessAddSettings(settings_t& settings, const string prefix = ""); +} //namespace libp #endif /* INITIALGUESS_HPP */ diff --git a/include/linAlg.hpp b/include/linAlg.hpp index 93fede004..71ed4d3c8 100644 --- a/include/linAlg.hpp +++ b/include/linAlg.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -28,126 +28,187 @@ SOFTWARE. 
#define LINALG_HPP #include "core.hpp" +#include "memory.hpp" -using std::vector; -using std::string; +namespace libp { class platform_t; //launcher for basic linear algebra OCCA kernels class linAlg_t { -public: - platform_t *platform; - occa::properties kernelInfo; - - int blocksize; - - //scratch space for reductions - dfloat *scratch; - occa::memory h_scratch; - occa::memory o_scratch; - + public: linAlg_t(); + linAlg_t(platform_t *_platform) { Setup(_platform); } void Setup(platform_t *_platform); //initialize list of kernels - void InitKernels(vector kernels); - - ~linAlg_t(); + void InitKernels(std::vector kernels); /*********************/ /* vector operations */ /*********************/ // o_a[n] = alpha - void set(const dlong N, const dfloat alpha, occa::memory& o_a); + void set(const dlong N, const dfloat alpha, deviceMemory o_a); // o_a[n] *= alpha - void scale(const dlong N, const dfloat alpha, occa::memory& o_a); + void scale(const dlong N, const dfloat alpha, deviceMemory o_a); // o_a[n] += alpha - void add(const dlong N, const dfloat alpha, occa::memory& o_a); + void add(const dlong N, const dfloat alpha, deviceMemory o_a); // o_y[n] = beta*o_y[n] + alpha*o_x[n] - void axpy(const dlong N, const dfloat alpha, occa::memory& o_x, - const dfloat beta, occa::memory& o_y); + void axpy(const dlong N, const dfloat alpha, deviceMemory o_x, + const dfloat beta, deviceMemory o_y); // o_z[n] = beta*o_y[n] + alpha*o_x[n] - void zaxpy(const dlong N, const dfloat alpha, occa::memory& o_x, - const dfloat beta, occa::memory& o_y, - occa::memory& o_z); + void zaxpy(const dlong N, const dfloat alpha, deviceMemory o_x, + const dfloat beta, deviceMemory o_y, + deviceMemory o_z); // o_x[n] = alpha*o_a[n]*o_x[n] void amx(const dlong N, const dfloat alpha, - occa::memory& o_a, occa::memory& o_x); + deviceMemory o_a, deviceMemory o_x); // o_y[n] = alpha*o_a[n]*o_x[n] + beta*o_y[n] void amxpy(const dlong N, const dfloat alpha, - occa::memory& o_a, occa::memory& o_x, - const 
dfloat beta, occa::memory& o_y); + deviceMemory o_a, deviceMemory o_x, + const dfloat beta, deviceMemory o_y); // o_z[n] = alpha*o_a[n]*o_x[n] + beta*o_y[n] void zamxpy(const dlong N, const dfloat alpha, - occa::memory& o_a, occa::memory& o_x, - const dfloat beta, occa::memory& o_y, occa::memory& o_z); + deviceMemory o_a, deviceMemory o_x, + const dfloat beta, deviceMemory o_y, deviceMemory o_z); // o_x[n] = alpha*o_x[n]/o_a[n] void adx(const dlong N, const dfloat alpha, - occa::memory& o_a, occa::memory& o_x); + deviceMemory o_a, deviceMemory o_x); // o_y[n] = alpha*o_x[n]/o_a[n] + beta*o_y[n] void adxpy(const dlong N, const dfloat alpha, - occa::memory& o_a, occa::memory& o_x, - const dfloat beta, occa::memory& o_y); + deviceMemory o_a, deviceMemory o_x, + const dfloat beta, deviceMemory o_y); // o_z[n] = alpha*o_x[n]/o_a[n] + beta*o_y[n] void zadxpy(const dlong N, const dfloat alpha, - occa::memory& o_a, occa::memory& o_x, - const dfloat beta, occa::memory& o_y, occa::memory& o_z); + deviceMemory o_a, deviceMemory o_x, + const dfloat beta, deviceMemory o_y, deviceMemory o_z); // \min o_a - dfloat min(const dlong N, occa::memory& o_a, MPI_Comm comm); + dfloat min(const dlong N, deviceMemory o_a, comm_t comm); // \max o_a - dfloat max(const dlong N, occa::memory& o_a, MPI_Comm comm); + dfloat max(const dlong N, deviceMemory o_a, comm_t comm); // \sum o_a - dfloat sum(const dlong N, occa::memory& o_a, MPI_Comm comm); + dfloat sum(const dlong N, deviceMemory o_a, comm_t comm); // ||o_a||_2 - dfloat norm2(const dlong N, occa::memory& o_a, MPI_Comm comm); + dfloat norm2(const dlong N, deviceMemory o_a, comm_t comm); // o_x.o_y - dfloat innerProd(const dlong N, occa::memory& o_x, occa::memory& o_y, - MPI_Comm comm); + dfloat innerProd(const dlong N, deviceMemory o_x, deviceMemory o_y, + comm_t comm); // ||o_a||_w2 - dfloat weightedNorm2(const dlong N, occa::memory& o_w, occa::memory& o_a, - MPI_Comm comm); + dfloat weightedNorm2(const dlong N, deviceMemory o_w, 
deviceMemory o_a, + comm_t comm); // o_w.o_x.o_y - dfloat weightedInnerProd(const dlong N, occa::memory& o_w, occa::memory& o_x, - occa::memory& o_y, MPI_Comm comm); - - occa::kernel setKernel; - occa::kernel addKernel; - occa::kernel scaleKernel; - occa::kernel axpyKernel; - occa::kernel zaxpyKernel; - occa::kernel amxKernel; - occa::kernel amxpyKernel; - occa::kernel zamxpyKernel; - occa::kernel adxKernel; - occa::kernel adxpyKernel; - occa::kernel zadxpyKernel; - occa::kernel minKernel; - occa::kernel maxKernel; - occa::kernel sumKernel; - occa::kernel norm2Kernel; - occa::kernel weightedNorm2Kernel; - occa::kernel innerProdKernel; - occa::kernel weightedInnerProdKernel; + dfloat weightedInnerProd(const dlong N, deviceMemory o_w, deviceMemory o_x, + deviceMemory o_y, comm_t comm); + + static void matrixRightSolve(const int NrowsA, const int NcolsA, const memory A, + const int NrowsB, const int NcolsB, const memory B, + memory C); + static void matrixRightSolve(const int NrowsA, const int NcolsA, const memory A, + const int NrowsB, const int NcolsB, const memory B, + memory C); + static void matrixUnderdeterminedRightSolveMinNorm(const int NrowsA, const int NcolsA, + const memory A, + const memory b, + memory x); + static void matrixUnderdeterminedRightSolveMinNorm(const int NrowsA, const int NcolsA, + const memory A, + const memory b, + memory x); + static void matrixUnderdeterminedRightSolveCPQR(const int NrowsA, const int NcolsA, + const memory A, + const memory b, + memory x); + static void matrixUnderdeterminedRightSolveCPQR(const int NrowsA, const int NcolsA, + const memory A, + const memory b, + memory x); + + static void matrixEigenVectors(const int N, const memory A, + memory VR, memory WR, memory WI); + static void matrixEigenVectors(const int N, const memory A, + memory VR, memory WR, memory WI); + + static void matrixEigenValues(const int N, const memory A, + memory WR, memory WI); + static void matrixEigenValues(const int N, const memory A, + memory 
WR, memory WI); + + static void matrixInverse(const int N, memory A); + static void matrixInverse(const int N, memory A); + + static double matrixConditionNumber(const int N, const memory A); + static float matrixConditionNumber(const int N, const memory A); + + static void matrixTranspose(const int M, const int N, + const memory A, const int LDA, + memory AT, const int LDAT); + static void matrixTranspose(const int M, const int N, + const memory A, const int LDA, + memory AT, const int LDAT); + + static void matrixTranspose(const int M, const int N, + const memory A, const int LDA, + memory AT, const int LDAT); + static void matrixTranspose(const int M, const int N, + const memory A, const int LDA, + memory AT, const int LDAT); + + private: + platform_t *platform; + properties_t kernelInfo; + + static constexpr int blocksize = 256; + + //scratch space for reductions + deviceMemory o_scratch; + pinnedMemory h_scratch; + + kernel_t setKernel; + kernel_t addKernel; + kernel_t scaleKernel; + kernel_t axpyKernel; + kernel_t zaxpyKernel; + kernel_t amxKernel; + kernel_t amxpyKernel; + kernel_t zamxpyKernel; + kernel_t adxKernel; + kernel_t adxpyKernel; + kernel_t zadxpyKernel; + kernel_t minKernel1; + kernel_t minKernel2; + kernel_t maxKernel1; + kernel_t maxKernel2; + kernel_t sumKernel1; + kernel_t sumKernel2; + kernel_t norm2Kernel1; + kernel_t norm2Kernel2; + kernel_t weightedNorm2Kernel1; + kernel_t weightedNorm2Kernel2; + kernel_t innerProdKernel1; + kernel_t innerProdKernel2; + kernel_t weightedInnerProdKernel1; + kernel_t weightedInnerProdKernel2; }; +} //namespace libp + #endif diff --git a/include/linearSolver.hpp b/include/linearSolver.hpp index 30f431008..84e2f0a62 100644 --- a/include/linearSolver.hpp +++ b/include/linearSolver.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Anthony Austin Permission is hereby 
granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -31,162 +31,190 @@ SOFTWARE. #include "platform.hpp" #include "solver.hpp" #include "precon.hpp" +#include "initialGuess.hpp" -//virtual base linear solver class +namespace libp { + +namespace LinearSolver { class linearSolverBase_t; } + +/* General LinearSolver object*/ class linearSolver_t { + public: + linearSolver_t() = default; + + /*Generic setup. Create a Solver object and wrap it in a shared_ptr*/ + template + void Setup(Args&& ... args) { + ls = std::make_shared(args...); + + /*Make an initial guess strategy if we dont have one setup yet*/ + if (ig==nullptr) { + MakeDefaultInitialGuessStrategy(); + } + } + + /*Generic setup. Create a InitialGuess object and wrap it in a shared_ptr*/ + template + void SetupInitialGuess(Args&& ... args) { + ig = std::make_shared(args...); + } + + int Solve(operator_t& linearOperator, operator_t& precon, + deviceMemory& o_x, deviceMemory& o_rhs, + const dfloat tol, const int MAXIT, const int verbose); + + private: + std::shared_ptr ls=nullptr; + std::shared_ptr ig=nullptr; + + void MakeDefaultInitialGuessStrategy(); + + void assertInitialized(); +}; + + +namespace LinearSolver { + +//virtual base linear solver class +class linearSolverBase_t { public: - platform_t& platform; - settings_t& settings; - MPI_Comm comm; + platform_t platform; + settings_t settings; + comm_t comm; dlong N; dlong Nhalo; - linearSolver_t(dlong _N, dlong _Nhalo, - platform_t& _platform, settings_t& _settings, MPI_Comm _comm): + linearSolverBase_t(dlong _N, dlong _Nhalo, + platform_t& _platform, settings_t& _settings, comm_t _comm): platform(_platform), settings(_settings), comm(_comm), N(_N), Nhalo(_Nhalo) {} - static linearSolver_t* Setup(dlong _N, dlong _Nhalo, - platform_t& platform, settings_t& settings, MPI_Comm _comm); - - virtual int Solve(solver_t& solver, precon_t& precon, - occa::memory& o_x, occa::memory& 
o_rhs, + virtual int Solve(operator_t& linearOperator, operator_t& precon, + deviceMemory& o_x, deviceMemory& o_rhs, const dfloat tol, const int MAXIT, const int verbose)=0; - - virtual ~linearSolver_t(){} }; //Preconditioned Conjugate Gradient -class pcg: public linearSolver_t { +class pcg: public linearSolverBase_t { private: - occa::memory o_p, o_Ap, o_z, o_Ax; + deviceMemory o_p, o_Ap, o_z, o_Ax; - dfloat* tmprdotr; - occa::memory h_tmprdotr; - occa::memory o_tmprdotr; + pinnedMemory rdotr; + deviceMemory o_rdotr; int flexible; - occa::kernel updatePCGKernel; + kernel_t updatePCGKernel; - dfloat UpdatePCG(const dfloat alpha, occa::memory &o_x, occa::memory &o_r); + dfloat UpdatePCG(const dfloat alpha, deviceMemory& o_x, deviceMemory& o_r); public: pcg(dlong _N, dlong _Nhalo, - platform_t& _platform, settings_t& _settings, MPI_Comm _comm); - ~pcg(); + platform_t& _platform, settings_t& _settings, comm_t _comm); - int Solve(solver_t& solver, precon_t& precon, - occa::memory& o_x, occa::memory& o_rhs, + int Solve(operator_t& linearOperator, operator_t& precon, + deviceMemory& o_x, deviceMemory& o_rhs, const dfloat tol, const int MAXIT, const int verbose); }; //Preconditioned GMRES -class pgmres: public linearSolver_t { +class pgmres: public linearSolverBase_t { private: - occa::memory *o_V=nullptr; - occa::memory o_Ax, o_z, o_r; + deviceMemory o_Ax, o_z, o_r; + memory> o_V; int restart; - dfloat *H=nullptr, *sn=nullptr, *cs=nullptr, *s=nullptr, *y=nullptr; + memory H, sn, cs, s, y; - void UpdateGMRES(occa::memory& o_x, const int I); + void UpdateGMRES(deviceMemory& o_x, const int I); public: pgmres(dlong _N, dlong _Nhalo, - platform_t& _platform, settings_t& _settings, MPI_Comm _comm); - ~pgmres(); + platform_t& _platform, settings_t& _settings, comm_t _comm); - int Solve(solver_t& solver, precon_t& precon, - occa::memory& o_x, occa::memory& o_rhs, + int Solve(operator_t& linearOperator, operator_t& precon, + deviceMemory& o_x, deviceMemory& o_rhs, const dfloat 
tol, const int MAXIT, const int verbose); }; // Preconditioned MINRES -class pminres : public linearSolver_t { +class pminres : public linearSolverBase_t { private: - occa::memory o_p; - occa::memory o_z; - occa::memory o_r; - occa::memory o_r_old; - occa::memory o_q; - occa::memory o_q_old; + deviceMemory o_p; + deviceMemory o_z; + deviceMemory o_r; + deviceMemory o_r_old; + deviceMemory o_q; + deviceMemory o_q_old; - occa::kernel updateMINRESKernel; + kernel_t updateMINRESKernel; - dfloat innerProd(occa::memory& o_x, occa::memory& o_y); + dfloat innerProd(deviceMemory& o_x, deviceMemory& o_y); void UpdateMINRES(const dfloat ma2, const dfloat ma3, const dfloat alpha, const dfloat beta); public: pminres(dlong _N, dlong _Nhalo, - platform_t& _platform, settings_t& _settings, MPI_Comm _comm); - ~pminres(); + platform_t& _platform, settings_t& _settings, comm_t _comm); - int Solve(solver_t& solver, precon_t& precon, - occa::memory& o_x, occa::memory& o_rhs, + int Solve(operator_t& linearOperator, operator_t& precon, + deviceMemory& o_x, deviceMemory& o_rhs, const dfloat tol, const int MAXIT, const int verbose); }; //Non-Blocking Preconditioned Conjugate Gradient -class nbpcg: public linearSolver_t { +class nbpcg: public linearSolverBase_t { private: - occa::memory o_p, o_s, o_S, o_z, o_Z, o_Ax; + deviceMemory o_p, o_s, o_S, o_z, o_Z, o_Ax; - dfloat* tmpdots; - occa::memory h_tmpdots; - occa::memory o_tmpdots; + pinnedMemory dots; + deviceMemory o_dots; - occa::kernel update1NBPCGKernel; - occa::kernel update2NBPCGKernel; + kernel_t update1NBPCGKernel; + kernel_t update2NBPCGKernel; - dfloat *localdots, *globaldots; - - MPI_Request request; - MPI_Status status; + Comm::request_t request; void Update1NBPCG(const dfloat beta); - void Update2NBPCG(const dfloat alpha, occa::memory &o_r); + void Update2NBPCG(const dfloat alpha, deviceMemory& o_r); public: nbpcg(dlong _N, dlong _Nhalo, - platform_t& _platform, settings_t& _settings, MPI_Comm _comm); - ~nbpcg(); + platform_t& 
_platform, settings_t& _settings, comm_t _comm); - int Solve(solver_t& solver, precon_t& precon, - occa::memory& o_x, occa::memory& o_rhs, + int Solve(operator_t& linearOperator, operator_t& precon, + deviceMemory& o_x, deviceMemory& o_rhs, const dfloat tol, const int MAXIT, const int verbose); }; //Non-Blocking Flexible Preconditioned Conjugate Gradient -class nbfpcg: public linearSolver_t { +class nbfpcg: public linearSolverBase_t { private: - occa::memory o_u, o_p, o_w, o_n, o_m, o_s, o_z, o_q, o_Ax; - - dfloat* tmpdots; - occa::memory h_tmpdots; - occa::memory o_tmpdots; + deviceMemory o_u, o_p, o_w, o_n, o_m, o_s, o_z, o_q, o_Ax; - occa::kernel update0NBFPCGKernel; - occa::kernel update1NBFPCGKernel; + pinnedMemory dots; + deviceMemory o_dots; - dfloat *localdots, *globaldots; + kernel_t update0NBFPCGKernel; + kernel_t update1NBFPCGKernel; - MPI_Request request; - MPI_Status status; + Comm::request_t request; - void Update0NBFPCG(occa::memory &o_r); - void Update1NBFPCG(const dfloat alpha, occa::memory &o_x, occa::memory &o_r); + void Update0NBFPCG(deviceMemory& o_r); + void Update1NBFPCG(const dfloat alpha, deviceMemory& o_x, deviceMemory& o_r); public: nbfpcg(dlong _N, dlong _Nhalo, - platform_t& _platform, settings_t& _settings, MPI_Comm _comm); - ~nbfpcg(); + platform_t& _platform, settings_t& _settings, comm_t _comm); - int Solve(solver_t& solver, precon_t& precon, - occa::memory& o_x, occa::memory& o_rhs, + int Solve(operator_t& linearOperator, operator_t& precon, + deviceMemory& o_x, deviceMemory& o_rhs, const dfloat tol, const int MAXIT, const int verbose); }; +} //namespace LinearSolver + +} //namespace libp + #endif diff --git a/include/memory.hpp b/include/memory.hpp new file mode 100644 index 000000000..7232c897e --- /dev/null +++ b/include/memory.hpp @@ -0,0 +1,778 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining 
a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#ifndef LIBP_MEMORY_HPP +#define LIBP_MEMORY_HPP + +#include "utils.hpp" + +namespace libp { + +template +class memory { + template friend class memory; + + private: + using size_t = std::size_t; + using ptrdiff_t = std::ptrdiff_t; + + std::shared_ptr shrdPtr; + size_t lngth; + size_t offset; + + public: + memory() : + lngth{0}, + offset{0} {} + + memory(const size_t lngth_) : + shrdPtr(new T[lngth_]), + lngth{lngth_}, + offset{0} {} + + memory(const size_t lngth_, + const T val) : + shrdPtr(new T[lngth_]), + lngth{lngth_}, + offset{0} { + #pragma omp parallel for + for (size_t i=0;i + memory(const memory &m): + shrdPtr{std::reinterpret_pointer_cast(m.shrdPtr)}, + lngth{m.lngth*sizeof(T)/sizeof(U)}, + offset{m.offset*sizeof(T)/sizeof(U)} { + // Check that this conversion made sense + LIBP_ABORT("libp::memory type conversion failed. 
Trying to convert " + << m.lngth << " " << sizeof(T) << "-byte words to " + << lngth << " " << sizeof(U) << "-byte words.", + lngth*sizeof(U) != m.lngth*sizeof(T)); + + LIBP_ABORT("libp::memory type conversion failed. Source memory has offset at " + << m.lngth << " " << sizeof(T) << "-byte words, destination memory would have offset at" + << lngth << " " << sizeof(U) << "-byte words.", + offset*sizeof(U) != m.offset*sizeof(T)); + } + + memory(const memory &m)=default; + memory& operator = (const memory &m)=default; + ~memory()=default; + + void malloc(const size_t lngth_) { + *this = memory(lngth_); + } + + void malloc(const size_t lngth_, const T val) { + *this = memory(lngth_, val); + } + + void calloc(const size_t lngth_) { + *this = memory(lngth_, T{0}); + } + + void realloc(const size_t lngth_) { + memory m(lngth_); + const ptrdiff_t cnt = std::min(lngth, lngth_); + m.copyFrom(*this, cnt); + *this = m; + } + + memory& swap(memory &m) { + std::swap(shrdPtr, m.shrdPtr); + std::swap(lngth, m.lngth); + std::swap(offset, m.offset); + return *this; + } + + T* ptr() { + return shrdPtr.get()+offset; + } + const T* ptr() const { + return shrdPtr.get()+offset; + } + + T* begin() {return ptr();} + T* end() {return ptr() + length();} + + size_t length() const { + return lngth; + } + + size_t size() const { + return lngth*sizeof(T); + } + + size_t use_count() const { + return shrdPtr.use_count(); + } + + T& operator[](const ptrdiff_t idx) const { + return shrdPtr[idx+offset]; + } + + bool operator == (const memory &other) const { + return (shrdPtr==other.shrdPtr && offset==other.offset); + } + bool operator != (const memory &other) const { + return (shrdPtr!=other.shrdPtr || offset!=other.offset); + } + + memory operator + (const ptrdiff_t offset_) const { + return slice(offset_); + } + memory& operator += (const ptrdiff_t offset_) { + *this = slice(offset_); + return *this; + } + + memory slice(const ptrdiff_t offset_, + const ptrdiff_t count = -1) const { + memory 
m(*this); + m.offset = offset + offset_; + m.lngth = (count==-1) + ? (lngth - offset_) + : count; + return m; + } + + /*Copy from raw ptr*/ + void copyFrom(const T* src, + const ptrdiff_t count = -1, + const ptrdiff_t offset_ = 0) { + + const ptrdiff_t cnt = (count==-1) ? lngth : count; + + LIBP_ABORT("libp::memory::copyFrom Cannot have negative count (" + << cnt << ")", + cnt < 0); + LIBP_ABORT("libp::memory::copyFrom Cannot have negative offset (" + << offset_ << ")", + offset_ < 0); + LIBP_ABORT("libp::memory::copyFrom Destination memory has size [" << lngth << "]," + << " trying to access [" << offset_ << ", " << offset_+static_cast(cnt) << "]", + static_cast(cnt)+offset_ > lngth); + + std::copy(src, + src+cnt, + ptr()+offset_); + } + + /*Copy from memory*/ + void copyFrom(const memory src, + const ptrdiff_t count = -1, + const ptrdiff_t offset_ = 0) { + const ptrdiff_t cnt = (count==-1) ? lngth : count; + + LIBP_ABORT("libp::memory::copyFrom Cannot have negative count (" + << cnt << ")", + cnt < 0); + LIBP_ABORT("libp::memory::copyFrom Cannot have negative offset (" + << offset_ << ")", + offset_ < 0); + LIBP_ABORT("libp::memory::copyFrom Source memory has size [" << src.length() << "]," + << " trying to access [0, " << static_cast(cnt) << "]", + static_cast(cnt) > src.length()); + LIBP_ABORT("libp::memory::copyFrom Destination memory has size [" << lngth << "]," + << " trying to access [" << offset_ << ", " << offset_+static_cast(cnt) << "]", + static_cast(cnt)+offset_ > lngth); + + std::copy(src.ptr(), + src.ptr()+cnt, + ptr()+offset_); + } + + /*Copy to raw pointer*/ + void copyTo(T *dest, + const ptrdiff_t count = -1, + const ptrdiff_t offset_ = 0) const { + const ptrdiff_t cnt = (count==-1) ? 
lngth : count; + + LIBP_ABORT("libp::memory::copyTo Cannot have negative count (" + << cnt << ")", + cnt < 0); + LIBP_ABORT("libp::memory::copyTo Cannot have negative offset (" + << offset_ << ")", + offset_ < 0); + LIBP_ABORT("libp::memory::copyTo Source memory has size [" << lngth << "]," + << " trying to access [" << offset_ << ", " << offset_+static_cast(cnt) << "]", + static_cast(cnt)+offset_ > lngth); + + std::copy(ptr()+offset_, + ptr()+offset_+cnt, + dest); + } + + /*Copy to memory*/ + void copyTo(memory dest, + const ptrdiff_t count = -1, + const ptrdiff_t offset_ = 0) const { + const ptrdiff_t cnt = (count==-1) ? lngth : count; + + LIBP_ABORT("libp::memory::copyTo Cannot have negative count (" + << cnt << ")", + cnt < 0); + LIBP_ABORT("libp::memory::copyTo Cannot have negative offset (" + << offset_ << ")", + offset_ < 0); + LIBP_ABORT("libp::memory::copyTo Destination memory has size [" << dest.length() << "]," + << " trying to access [0, " << cnt << "]", + static_cast(cnt) > dest.length()); + LIBP_ABORT("libp::memory::copyTo Source memory has size [" << lngth << "]," + << " trying to access [" << offset_ << ", " << offset_+static_cast(cnt) << "]", + static_cast(cnt)+offset_ > lngth); + + std::copy(ptr()+offset_, + ptr()+offset_+cnt, + dest.ptr()); + } + + memory clone() const { + memory m(lngth); + m.copyFrom(*this); + return m; + } + + void free() { + shrdPtr = nullptr; + lngth=0; + offset=0; + } +}; + +template +std::ostream& operator << (std::ostream &out, + const memory &memory) { + out << "memory - " + << "type: " << typeid(T).name() << ", " + << "ptr : " << memory.ptr() << ", " + << "length : " << memory.length() << ", " + << "use_count : " << memory.use_count(); + return out; +} + +/*Extern declare common instantiations for faster compilation*/ +extern template class memory; +extern template class memory; +extern template class memory; +extern template class memory; + +/*libp::deviceMemory is a wrapper around occa::memory*/ +template +class 
deviceMemory: public occa::memory { + public: + deviceMemory() = default; + deviceMemory(const deviceMemory &m)=default; + deviceMemory(occa::memory m): + occa::memory(m) + { + if (isInitialized()) { + if (occa::dtype::get() == occa::dtype::none) { + occa::memory::setDtype(occa::dtype::byte); + } else { + occa::memory::setDtype(occa::dtype::get()); + } + } + } + + /*Conversion constructor*/ + template + deviceMemory(const deviceMemory &m): + occa::memory(m) + { + if (isInitialized()) { + if (occa::dtype::get() == occa::dtype::none) { + occa::memory::setDtype(occa::dtype::byte); + } else { + occa::memory::setDtype(occa::dtype::get()); + } + } + } + + deviceMemory& operator = (const deviceMemory &m)=default; + ~deviceMemory()=default; + + T* ptr() { + return static_cast(occa::memory::ptr()); + } + const T* ptr() const { + return static_cast(occa::memory::ptr()); + } + + size_t length() const { + return size()/sizeof(T); + } + + T& operator[](const ptrdiff_t idx) { + return ptr()[idx]; + } + + deviceMemory operator + (const ptrdiff_t offset) const { + if (isInitialized()) + return deviceMemory(occa::memory::operator+(offset)); + else + return deviceMemory(); + } + + deviceMemory& operator += (const ptrdiff_t offset) { + *this = deviceMemory(occa::memory::slice(offset)); + return *this; + } + + /*Copy from libp::memory*/ + void copyFrom(const libp::memory src, + const ptrdiff_t count = -1, + const ptrdiff_t offset = 0, + const properties_t &props = properties_t()) { + const ptrdiff_t cnt = (count==-1) ? 
length() : count; + + if (cnt==0) return; + + LIBP_ABORT("libp::memory::copyFrom Source memory has size [" << src.length() << "]," + << " trying to access [0, " << static_cast(cnt) << "]", + static_cast(cnt) > src.length()); + + occa::memory::copyFrom(src.ptr(), + cnt*sizeof(T), + offset*sizeof(T), + props); + } + + void copyFrom(const libp::memory src, + const properties_t &props) { + + if (length()==0) return; + + LIBP_ABORT("libp::memory::copyFrom Source memory has size [" << src.length() << "]," + << " trying to access [0, " << length() << "]", + length() > src.length()); + + occa::memory::copyFrom(src.ptr(), + length()*sizeof(T), + 0, + props); + } + + /*Copy from libp::deviceMemory*/ + void copyFrom(const deviceMemory src, + const ptrdiff_t count = -1, + const ptrdiff_t offset = 0, + const properties_t &props = properties_t()) { + const ptrdiff_t cnt = (count==-1) ? length() : count; + + if (cnt==0) return; + + occa::memory::copyFrom(src, + cnt*sizeof(T), + offset*sizeof(T), + 0, + props); + } + + void copyFrom(const deviceMemory src, + const properties_t &props) { + + if (length()==0) return; + + occa::memory::copyFrom(src, + length()*sizeof(T), + 0, + 0, + props); + } + + /*Copy to libp::memory*/ + void copyTo(libp::memory dest, + const ptrdiff_t count = -1, + const ptrdiff_t offset = 0, + const properties_t &props = properties_t()) const { + const ptrdiff_t cnt = (count==-1) ? 
length() : count; + + if (cnt==0) return; + + LIBP_ABORT("libp::memory::copyTo Destination memory has size [" << dest.length() << "]," + << " trying to access [0, " << static_cast(cnt) << "]", + static_cast(cnt) > dest.length()); + + occa::memory::copyTo(dest.ptr(), + cnt*sizeof(T), + offset*sizeof(T), + props); + } + + void copyTo(libp::memory dest, + const properties_t &props) const { + + if (length()==0) return; + + LIBP_ABORT("libp::memory::copyTo Destination memory has size [" << dest.length() << "]," + << " trying to access [0, " << length() << "]", + length() > dest.length()); + + occa::memory::copyTo(dest.ptr(), + length()*sizeof(T), + 0, + props); + } + + /*Copy to libp::deviceMemory*/ + void copyTo(deviceMemory dest, + const ptrdiff_t count = -1, + const ptrdiff_t offset = 0, + const properties_t &props = properties_t()) const { + const ptrdiff_t cnt = (count==-1) ? length() : count; + + if (cnt==0) return; + + occa::memory::copyTo(dest, + cnt*sizeof(T), + 0, + offset*sizeof(T), + props); + } + + void copyTo(deviceMemory dest, + const properties_t &props) const { + + if (length()==0) return; + + occa::memory::copyTo(dest, + length()*sizeof(T), + 0, + 0, + props); + } +}; + +/*Extern declare common instantiations for faster compilation*/ +extern template class deviceMemory; +extern template class deviceMemory; +extern template class deviceMemory; +extern template class deviceMemory; + +/*libp::pinnedMemory is another wrapper around occa::memory, + but is allocated slightly differently*/ +template +class pinnedMemory: public occa::memory { + public: + pinnedMemory() = default; + pinnedMemory(const pinnedMemory &m)=default; + pinnedMemory(occa::memory m): + occa::memory(m) + { + if (isInitialized()) { + if (occa::dtype::get() == occa::dtype::none) { + occa::memory::setDtype(occa::dtype::byte); + } else { + occa::memory::setDtype(occa::dtype::get()); + } + } + }; + + /*Conversion constructor*/ + template + pinnedMemory(const pinnedMemory &m): + 
occa::memory(m) + { + if (isInitialized()) { + if (occa::dtype::get() == occa::dtype::none) { + occa::memory::setDtype(occa::dtype::byte); + } else { + occa::memory::setDtype(occa::dtype::get()); + } + } + } + + pinnedMemory& operator = (const pinnedMemory &m)=default; + ~pinnedMemory()=default; + + T* ptr() { + return static_cast(occa::memory::ptr()); + } + const T* ptr() const { + return static_cast(occa::memory::ptr()); + } + + size_t length() const { + return size()/sizeof(T); + } + + T& operator[](const ptrdiff_t idx) { + return ptr()[idx]; + } + + pinnedMemory operator + (const ptrdiff_t offset) const { + if (isInitialized()) + return pinnedMemory(occa::memory::operator+(offset)); + else + return pinnedMemory(); + } + + pinnedMemory& operator += (const ptrdiff_t offset) { + *this = pinnedMemory(occa::memory::slice(offset)); + return *this; + } + + /*Copy from raw pointer*/ + void copyFrom(const T* src, + const ptrdiff_t count = -1, + const ptrdiff_t offset = 0, + const properties_t &props = properties_t()) { + const ptrdiff_t cnt = (count==-1) ? length() : count; + + if (cnt==0) return; + + occa::memory::copyFrom(src, + cnt*sizeof(T), + offset*sizeof(T), + props); + } + + void copyFrom(const T* src, + const properties_t &props) { + + if (length()==0) return; + + occa::memory::copyFrom(src, + length()*sizeof(T), + 0, + props); + } + + /*Copy from libp::memory*/ + void copyFrom(const libp::memory src, + const ptrdiff_t count = -1, + const ptrdiff_t offset = 0, + const properties_t &props = properties_t()) { + const ptrdiff_t cnt = (count==-1) ? 
length() : count; + + if (cnt==0) return; + + LIBP_ABORT("libp::memory::copyFrom Source memory has size [" << src.length() << "]," + << " trying to access [0, " << static_cast(cnt) << "]", + static_cast(cnt) > src.length()); + + occa::memory::copyFrom(src.ptr(), + cnt*sizeof(T), + offset*sizeof(T), + props); + } + + void copyFrom(const libp::memory src, + const properties_t &props) { + + if (length()==0) return; + + LIBP_ABORT("libp::memory::copyFrom Source memory has size [" << src.length() << "]," + << " trying to access [0, " << length() << "]", + length() > src.length()); + + occa::memory::copyFrom(src.ptr(), + length()*sizeof(T), + 0, + props); + } + + /*Copy from libp::deviceMemory*/ + void copyFrom(const deviceMemory src, + const ptrdiff_t count = -1, + const ptrdiff_t offset = 0, + const properties_t &props = properties_t()) { + const ptrdiff_t cnt = (count==-1) ? length() : count; + + if (cnt==0) return; + + occa::memory::copyFrom(src, + cnt*sizeof(T), + offset*sizeof(T), + 0, + props); + } + + void copyFrom(const deviceMemory src, + const properties_t &props) { + + if (length()==0) return; + + occa::memory::copyFrom(src, + length()*sizeof(T), + 0, + 0, + props); + } + + /*Copy from libp::pinnedMemory*/ + void copyFrom(const pinnedMemory src, + const ptrdiff_t count = -1, + const ptrdiff_t offset = 0, + const properties_t &props = properties_t()) { + const ptrdiff_t cnt = (count==-1) ? length() : count; + + if (cnt==0) return; + + occa::memory::copyFrom(src, + cnt*sizeof(T), + offset*sizeof(T), + 0, + props); + } + + void copyFrom(const pinnedMemory src, + const properties_t &props) { + + if (length()==0) return; + + occa::memory::copyFrom(src, + length()*sizeof(T), + 0, + 0, + props); + } + + /*Copy to raw pointer*/ + void copyTo(T* dest, + const ptrdiff_t count = -1, + const ptrdiff_t offset = 0, + const properties_t &props = properties_t()) const { + const ptrdiff_t cnt = (count==-1) ? 
length() : count; + + if (cnt==0) return; + + occa::memory::copyTo(dest, + cnt*sizeof(T), + offset*sizeof(T), + props); + } + + void copyTo(T* dest, + const properties_t &props) const { + + if (length()==0) return; + + occa::memory::copyTo(dest, + length()*sizeof(T), + 0, + props); + } + + /*Copy to libp::memory*/ + void copyTo(libp::memory dest, + const ptrdiff_t count = -1, + const ptrdiff_t offset = 0, + const properties_t &props = properties_t()) const { + const ptrdiff_t cnt = (count==-1) ? length() : count; + + if (cnt==0) return; + + LIBP_ABORT("libp::memory::copyTo Destination memory has size [" << dest.length() << "]," + << " trying to access [0, " << static_cast(cnt) << "]", + static_cast(cnt) > dest.length()); + + occa::memory::copyTo(dest.ptr(), + cnt*sizeof(T), + offset*sizeof(T), + props); + } + + void copyTo(libp::memory dest, + const properties_t &props) const { + + if (length()==0) return; + + LIBP_ABORT("libp::memory::copyTo Destination memory has size [" << dest.length() << "]," + << " trying to access [0, " << length() << "]", + length() > dest.length()); + + occa::memory::copyTo(dest.ptr(), + length()*sizeof(T), + 0, + props); + } + + /*Copy to libp::deviceMemory*/ + void copyTo(deviceMemory dest, + const ptrdiff_t count = -1, + const ptrdiff_t offset = 0, + const properties_t &props = properties_t()) const { + const ptrdiff_t cnt = (count==-1) ? length() : count; + + if (cnt==0) return; + + occa::memory::copyTo(dest, + cnt*sizeof(T), + 0, + offset*sizeof(T), + props); + } + + void copyTo(deviceMemory dest, + const properties_t &props) const { + + if (length()==0) return; + + occa::memory::copyTo(dest, + length()*sizeof(T), + 0, + 0, + props); + } + + /*Copy to libp::pinnedMemory*/ + void copyTo(pinnedMemory dest, + const ptrdiff_t count = -1, + const ptrdiff_t offset = 0, + const properties_t &props = properties_t()) const { + const ptrdiff_t cnt = (count==-1) ? 
length() : count; + + if (cnt==0) return; + + occa::memory::copyTo(dest, + cnt*sizeof(T), + 0, + offset*sizeof(T), + props); + } + + void copyTo(pinnedMemory dest, + const properties_t &props) const { + + if (length()==0) return; + + occa::memory::copyTo(dest, + length()*sizeof(T), + 0, + 0, + props); + } +}; + +} //namespace libp + +#endif diff --git a/include/mesh.hpp b/include/mesh.hpp index 4e58b790b..8b1d18ce8 100644 --- a/include/mesh.hpp +++ b/include/mesh.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -28,494 +28,1076 @@ SOFTWARE. #define MESH_HPP 1 #include "core.hpp" +#include "platform.hpp" #include "settings.hpp" #include "ogs.hpp" -#define TRIANGLES 3 -#define QUADRILATERALS 4 -#define TETRAHEDRA 6 -#define HEXAHEDRA 12 +namespace libp { class meshSettings_t: public settings_t { public: - meshSettings_t(MPI_Comm& _comm); + meshSettings_t() = default; + meshSettings_t(comm_t _comm); void report(); }; -class mesh_t { -public: - platform_t& platform; - meshSettings_t& settings; +namespace Mesh { + /*Element types*/ + enum ElementType { + TRIANGLES =3, + QUADRILATERALS=4, + TETRAHEDRA =6, + HEXAHEDRA =12 + }; +} //namespace Mesh - occa::properties props; +class mesh_t { + public: + platform_t platform; + meshSettings_t settings; + properties_t props; - MPI_Comm comm; + comm_t comm; int rank, size; + /*************************/ + /* Element Data */ + /*************************/ int dim; int Nverts, Nfaces, NfaceVertices; + Mesh::ElementType elementType; // indices of vertex nodes - int *vertexNodes; - - int elementType; + memory vertexNodes; hlong Nnodes=0; //global number of element vertices - dfloat *EX; // coordinates of vertices for each element - 
dfloat *EY; - dfloat *EZ; + memory EX; // coordinates of vertices for each element + memory EY; + memory EZ; dlong Nelements=0; //local element count hlong NelementsGlobal=0; //global element count - hlong *EToV; // element-to-vertex connectivity - dlong *EToE; // element-to-element connectivity - int *EToF; // element-to-(local)face connectivity - int *EToP; // element-to-partition/process connectivity - int *EToB; // element-to-boundary condition type + memory EToV; // element-to-vertex connectivity + memory EToE; // element-to-element connectivity + memory EToF; // element-to-(local)face connectivity + memory EToP; // element-to-partition/process connectivity + memory EToB; // element-to-boundary condition type + deviceMemory o_EToB; + + memory mapB; // node-to-boundary condition type + deviceMemory o_mapB; + + memory elementInfo; //type of element - hlong *elementInfo; //type of element + memory VmapM; // list of vertices on each face + memory VmapP; // list of vertices that are paired with face vertices // boundary faces hlong NboundaryFaces=0; // number of boundary faces - hlong *boundaryInfo; // list of boundary faces (type, vertex-1, vertex-2, vertex-3) + memory boundaryInfo; // list of boundary faces (type, vertex-1, vertex-2, vertex-3) + + /*************************/ + /* FEM Space */ + /*************************/ + int N=0, Np=0; // N = Polynomial order and Np = Nodes per element + memory r, s, t; // coordinates of local nodes + + int Nq=0; // N = Polynomial order, Nq=N+1 + memory gllz; // 1D GLL quadrature nodes + memory gllw; // 1D GLL quadrature weights + + // face node info + int Nfp=0; // number of nodes per face + memory faceNodes; // list of element reference interpolation nodes on element faces + memory faceVertices; // list of mesh vertices on each face + + /*************************/ + /* FEM Operators */ + /*************************/ + memory Dr, Ds, Dt; // collocation differentiation matrices + memory D; + deviceMemory o_D; + memory MM, 
invMM; // reference mass matrix + deviceMemory o_MM; + memory LIFT; // lift matrix + deviceMemory o_LIFT; + memory sM; // surface mass (MM*LIFT)^T + deviceMemory o_sM; + memory Srr, Srs, Srt; //element stiffness matrices + memory Ssr, Sss, Sst; + memory Str, Sts, Stt; + memory S; + deviceMemory o_S; + + /*************************/ + /* Cubature */ + /*************************/ + // cubature + int cubN=0, cubNp=0, cubNfp=0, cubNq=0; + memory cubr, cubs, cubt, cubw; // coordinates and weights of local cubature nodes + + memory cubInterp; // interpolate from W&B to cubature nodes + deviceMemory o_cubInterp; + memory cubProject; // projection matrix from cubature nodes to W&B nodes + deviceMemory o_cubProject; + memory cubD; // 1D differentiation matrix + deviceMemory o_cubD; + memory cubPDrT, cubPDsT, cubPDtT; // weak differentiation matrices + memory cubPDT; // packed weak differentiation matrices + deviceMemory o_cubPDT; + // surface integration node info + int intNfp=0; // number of integration nodes on each face + memory intr, ints, intw; + memory intInterp; // interp from surface node to integration nodes + deviceMemory o_intInterp; + memory intLIFT; // lift from surface integration nodes to W&B volume nodes + deviceMemory o_intLIFT; + + /*************************/ + /* Plotting */ + /*************************/ + // ploting info for generating field vtu + int plotN=0; + int plotNq=0; + int plotNp=0; + int plotNverts; // number of vertices for each plot element + int plotNelements; // number of "plot elements" per element + memory plotEToV; // triangulation of plot nodes + memory plotR, plotS, plotT; // coordinates of plot nodes in reference element + memory plotInterp; // reference to plot node interpolation matrix + + /*************************/ + /* Physical Space */ + /*************************/ + // volume node info + memory x, y, z; // coordinates of physical nodes + deviceMemory o_x, o_y, o_z; // coordinates of physical nodes + + memory vmapM; // list of 
volume nodes that are face nodes + deviceMemory o_vmapM; + memory vmapP; // list of volume nodes that are paired with face nodes + deviceMemory o_vmapP; + memory mapP; // list of surface nodes that are paired with -ve surface nodes + deviceMemory o_mapP; + + // Jacobian + memory wJ; + deviceMemory o_wJ; + // volumeGeometricFactors; + dlong Nvgeo; + memory vgeo; + deviceMemory o_vgeo; + // surfaceGeometricFactors; + dlong Nsgeo; + memory sgeo; + deviceMemory o_sgeo; + // second order volume geometric factors + dlong Nggeo; + memory ggeo; + deviceMemory o_ggeo; + + memory cubx, cuby, cubz; // coordinates of physical nodes + deviceMemory o_cubx, o_cuby, o_cubz; + memory intx, inty, intz; // coordinates of suface integration nodes + deviceMemory o_intx, o_inty, o_intz; + + memory cubwJ; //Jacobian at cubature points + deviceMemory o_cubwJ; + memory cubvgeo; //volume geometric data at cubature points + deviceMemory o_cubvgeo; + memory cubsgeo; //surface geometric data at cubature points + deviceMemory o_cubsgeo; + memory cubggeo; //second type volume geometric data at cubature points + deviceMemory o_cubggeo; + + /*************************/ + /* MPI Data */ + /*************************/ // MPI halo exchange info - halo_t *halo; // halo exchange pointer - halo_t *ringHalo; // ring halo exchange pointer + ogs::halo_t halo; // halo exchange pointer + ogs::halo_t ringHalo; // ring halo exchange pointer dlong NinternalElements=0; // number of elements that can update without halo exchange dlong NhaloElements=0; // number of elements that cannot update without halo exchange dlong totalHaloPairs=0; // number of elements to be received in halo exchange dlong totalRingElements=0;// number of elements to be received in ring halo exchange - dlong *internalElementIds; // list of elements that can update without halo exchange - dlong *haloElementIds; // list of elements to be sent in halo exchange - occa::memory o_internalElementIds; // list of elements that can update without halo 
exchange - occa::memory o_haloElementIds; // list of elements to be sent in halo exchange + + memory internalElementIds; // list of elements that can update without halo exchange + memory haloElementIds; // list of elements to be sent in halo exchange + deviceMemory o_internalElementIds; // list of elements that can update without halo exchange + deviceMemory o_haloElementIds; // list of elements to be sent in halo exchange // CG gather-scatter info - ogs_t *ogs; //occa gs pointer - hlong *globalIds; + ogs::ogs_t ogs; //occa gs pointer + memory globalIds; // list of elements that are needed for global gather-scatter - dlong NglobalGatherElements=0; - dlong *globalGatherElementList; - occa::memory o_globalGatherElementList; + dlong NglobalGatherElements; + memory globalGatherElementList; + deviceMemory o_globalGatherElementList; // list of elements that are not needed for global gather-scatter - dlong NlocalGatherElements=0; - dlong *localGatherElementList; - occa::memory o_localGatherElementList; - - // volumeGeometricFactors; - dlong Nvgeo=0; - dfloat *vgeo; - - // second order volume geometric factors - dlong Nggeo=0; - dfloat *ggeo; - - // volume node info - int N=0, Nq=0, Np=0; // N = Polynomial order, Nq=N+1, and Np = Nodes per element - dfloat *r, *s, *t; // coordinates of reference nodes - dfloat *w; // quadrature weights (1d quadrature for tensor prod elements) - dfloat *MM, *invMM; // reference mass matrix - - dfloat *Dr, *Ds, *Dt; // collocation differentiation matrices - dfloat *D; // packed collocation differentiation matrices, - // or 1D derivative for quads and hexes - - dfloat *Srr,*Srs, *Srt; //element stiffness matrices - dfloat *Sss,*Sst, *Stt; - dfloat *S; // packed element stiffness matrices - - dfloat *x, *y, *z; // coordinates of physical nodes - - /* GeoData for affine mapped elements */ - /* NC: disabling until we re-add treatment of affine elements - dfloat *EXYZ; // element vertices for reconstructing geofacs - dfloat *gllzw; // GLL nodes 
and weights - dfloat *ggeoNoJW; - occa::memory o_EXYZ; - occa::memory o_gllzw; - occa::memory o_ggeoNoJW; - */ - - // face node info - int Nfp=0; // number of nodes per face - int *faceNodes; // list of element reference interpolation nodes on element faces - dlong *vmapM; // list of volume nodes that are face nodes - dlong *vmapP; // list of volume nodes that are paired with face nodes - dlong *mapP; // list of surface nodes that are paired with -ve surface nodes - int *faceVertices; // list of mesh vertices on each face - - dfloat *LIFT; // lift matrix - dfloat *sM; // surface mass MM*LIFT - - dlong Nsgeo=0; - dfloat *sgeo; - - // cubature - int cubN=0, cubNp=0, cubNfp=0, cubNq=0; - dfloat *cubr, *cubs, *cubt, *cubw; // coordinates and weights of reference cubature nodes - dfloat *cubx, *cuby, *cubz; // coordinates of physical cubature nodes - dfloat *cubInterp; // interpolate from W&B to cubature nodes - dfloat *cubProject; // projection matrix from cubature nodes to W&B nodes - dfloat *cubD; // packed differentiation matrices - dfloat *cubPDT; // packed weak differentiation matrices - dfloat *cubPDrT, *cubPDsT, *cubPDtT; // weak differentiation matrices - - dfloat *cubvgeo; //volume geometric data at cubature points - dfloat *cubsgeo; //surface geometric data at cubature points - dfloat *cubggeo; //second type volume geometric data at cubature points - - // surface integration node info - int intNfp=0; // number of integration nodes on each face - dfloat *intr, *ints, *intw; - dfloat *intInterp; // interp from surface node to integration nodes - dfloat *intLIFT; // lift from surface integration nodes to W&B volume nodes - dfloat *intx, *inty, *intz; // coordinates of suface integration nodes + dlong NlocalGatherElements; + memory localGatherElementList; + deviceMemory o_localGatherElementList; + /*************************/ + /* PML */ + /*************************/ //pml lists dlong NnonPmlElements=0; dlong NpmlElements=0; - dlong *pmlElements; - dlong 
*nonPmlElements; - dlong *pmlIds; + memory pmlElements; + deviceMemory o_pmlElements; + memory nonPmlElements; + deviceMemory o_nonPmlElements; + memory pmlIds; + deviceMemory o_pmlIds; + + /*************************/ + /* Multirate timestepping*/ + /*************************/ //multirate lists int mrNlevels=0; - int *mrLevel; - dlong *mrNelements, *mrInterfaceNelements; - dlong **mrElements, **mrInterfaceElements; + memory mrLevel; + deviceMemory o_mrLevel; - //multirate pml lists - dlong *mrNnonPmlElements, *mrNpmlElements; - dlong **mrPmlElements, **mrNonPmlElements; - dlong **mrPmlIds; - - // plotting info for generating field vtu - int plotNverts=0; // number of vertices for each plot element - int plotN=0; // degree of plot interpolation - int plotNq=0; // plotNq = plotN+1 - int plotNp=0; // number of plot nodes per element - int plotNelements=0; // number of "plot elements" per element - int *plotEToV; // triangulation of plot nodes - dfloat *plotR, *plotS, *plotT; // coordinates of plot nodes in reference element - dfloat *plotInterp; // reference to plot node interpolation matrix + memory mrNelements, mrInterfaceNelements; + deviceMemory o_mrNelements, o_mrInterfaceNelements; + memory mrNnonPmlElements, mrNpmlElements; + + memory> mrElements, mrInterfaceElements; + memory> o_mrElements, o_mrInterfaceElements; + + //multirate pml lists + memory> mrPmlElements, mrNonPmlElements; + memory> o_mrPmlElements, o_mrNonPmlElements; + memory> mrPmlIds; + memory> o_mrPmlIds; + + /*************************/ + /* SEMFEM */ + /*************************/ //SEMFEM data int NpFEM=0, NelFEM=0; - int *FEMEToV; - dfloat *rFEM, *sFEM, *tFEM; - dfloat *SEMFEMInterp; + memory FEMEToV; + memory rFEM, sFEM, tFEM; + memory SEMFEMInterp; + deviceMemory o_SEMFEMInterp; + deviceMemory o_SEMFEMAnterp; - // occa stuff - occa::memory o_SEMFEMInterp; - occa::memory o_SEMFEMAnterp; + kernel_t MassMatrixKernel; - occa::memory o_MM; // Mass matrix - occa::memory o_D; // packed 
differentiation matricies (contains the transpose 1d D matrix for quads/hexes) - occa::memory o_S; // packed stiffness matricies - occa::memory o_LIFT;// Surface lift matrix - occa::memory o_sM; // Surface mass - - // volume, surface, and second order geometric factors - occa::memory o_vgeo, o_sgeo, o_ggeo; + mesh_t() = default; + mesh_t(platform_t& _platform, meshSettings_t& _settings, + comm_t _comm) { + Setup(_platform, _settings, _comm); + } - //face node mappings - occa::memory o_vmapM, o_vmapP, o_mapP; + // mesh setup + void Setup(platform_t& _platform, meshSettings_t& _settings, + comm_t _comm); - //element boundary mappings - occa::memory o_EToB; + // setup trace halo + void HaloRingSetup(); - //physical coordinates - occa::memory o_x, o_y, o_z; + // setup trace halo + ogs::halo_t HaloTraceSetup(int Nfields); - // cubature - occa::memory o_cubInterp, o_cubProject; //cubature interpolationm and projection - occa::memory o_cubPDT, o_cubD; // weak cubature derivatives, and cubature derivatives - occa::memory o_intLIFT, o_intInterp; + //Setup PML elements + void PmlSetup(); + void MultiRatePmlSetup(); - //physical cubature coordinates - occa::memory o_cubx, o_cuby, o_cubz; + //Multirate partitioning + void MultiRateSetup(memory EToDT); - //physical surface cubature coordinates - occa::memory o_intx, o_inty, o_intz; + // Multirate trace halo + memory MultiRateHaloTraceSetup(int Nfields); + + // Setup cubature + void CubatureSetup() { + switch (elementType) { + case Mesh::TRIANGLES: + CubatureSetupTri2D(); + break; + case Mesh::QUADRILATERALS: + CubatureSetupQuad2D(); + break; + case Mesh::TETRAHEDRA: + CubatureSetupTet3D(); + break; + case Mesh::HEXAHEDRA: + CubatureSetupHex3D(); + break; + } + } + + // Setup cubature physical nodes + void CubaturePhysicalNodes() { + switch (elementType) { + case Mesh::TRIANGLES: + if (dim==2) + CubaturePhysicalNodesTri2D(); + else + CubaturePhysicalNodesTri3D(); + break; + case Mesh::QUADRILATERALS: + if (dim==2) + 
CubaturePhysicalNodesQuad2D(); + else + CubaturePhysicalNodesQuad3D(); + break; + case Mesh::TETRAHEDRA: + CubaturePhysicalNodesTet3D(); + break; + case Mesh::HEXAHEDRA: + CubaturePhysicalNodesHex3D(); + break; + } + } - // volume, surface, and second order geometric factors at cubature points - occa::memory o_cubvgeo, o_cubsgeo, o_cubggeo; + dfloat MinCharacteristicLength(); - //pml lists - occa::memory o_pmlElements; - occa::memory o_nonPmlElements; - occa::memory o_pmlIds; + void PlotInterp(const memory q, memory Iq, memory scratch=memory()) { + switch (elementType) { + case Mesh::TRIANGLES: + PlotInterpTri2D(q, Iq, scratch); + break; + case Mesh::QUADRILATERALS: + PlotInterpQuad2D(q, Iq, scratch); + break; + case Mesh::TETRAHEDRA: + PlotInterpTet3D(q, Iq, scratch); + break; + case Mesh::HEXAHEDRA: + PlotInterpHex3D(q, Iq, scratch); + break; + } + } + + void MassMatrixApply(deviceMemory& o_q, deviceMemory& o_Mq); + void MassMatrixKernelSetup(int Nfields) { + switch (elementType) { + case Mesh::TRIANGLES: + MassMatrixKernelSetupTri2D(Nfields); + break; + case Mesh::QUADRILATERALS: + MassMatrixKernelSetupQuad2D(Nfields); + break; + case Mesh::TETRAHEDRA: + MassMatrixKernelSetupTet3D(Nfields); + break; + case Mesh::HEXAHEDRA: + MassMatrixKernelSetupHex3D(Nfields); + break; + } + } + + dfloat ElementCharacteristicLength(dlong e) { + switch (elementType) { + case Mesh::TRIANGLES: + return ElementCharacteristicLengthTri2D(e); + case Mesh::QUADRILATERALS: + return ElementCharacteristicLengthQuad2D(e); + case Mesh::TETRAHEDRA: + return ElementCharacteristicLengthTet3D(e); + case Mesh::HEXAHEDRA: + return ElementCharacteristicLengthHex3D(e); + default: + return 0.0; + } + } - //multirate lists - occa::memory o_mrLevel; - occa::memory o_mrNelements, o_mrInterfaceNelements; - occa::memory *o_mrElements, *o_mrInterfaceElements; + //create a new mesh object with the same geometry, but different degree + mesh_t SetupNewDegree(int Nf); - //multirate pml lists - occa::memory 
*o_mrPmlElements, *o_mrNonPmlElements; - occa::memory *o_mrPmlIds; + mesh_t SetupRingPatch(); - occa::kernel MassMatrixKernel; + mesh_t SetupSEMFEM(memory& globalIds, memory& mapB); - mesh_t() = delete; - mesh_t(platform_t& _platform, meshSettings_t& _settings, - MPI_Comm _comm); + int RXID, RYID, RZID; + int SXID, SYID, SZID; + int TXID, TYID, TZID; + int JID, JWID, IJWID; + int G00ID, G01ID, G02ID, G11ID, G12ID, G22ID; - virtual ~mesh_t(); + int NXID, NYID, NZID; + int SJID, IJID, IHID, WIJID, WSJID; - // generic mesh setup - static mesh_t& Setup(platform_t& _platform, meshSettings_t& _settings, - MPI_Comm _comm); + private: + /*Set the type of mesh*/ + void SetElementType(const Mesh::ElementType eType); // box mesh - virtual void SetupBox() = 0; + void SetupBox() { + switch (elementType) { + case Mesh::TRIANGLES: + SetupBoxTri2D(); + break; + case Mesh::QUADRILATERALS: + SetupBoxQuad2D(); + break; + case Mesh::TETRAHEDRA: + SetupBoxTet3D(); + break; + case Mesh::HEXAHEDRA: + SetupBoxHex3D(); + break; + } + } + void SetupBoxTri2D(); + void SetupBoxQuad2D(); + void SetupBoxTet3D(); + void SetupBoxHex3D(); // pml box mesh - virtual void SetupPmlBox() = 0; + void SetupPmlBox() { + switch (elementType) { + case Mesh::TRIANGLES: + SetupPmlBoxTri2D(); + break; + case Mesh::QUADRILATERALS: + SetupPmlBoxQuad2D(); + break; + case Mesh::TETRAHEDRA: + SetupPmlBoxTet3D(); + break; + case Mesh::HEXAHEDRA: + SetupPmlBoxHex3D(); + break; + } + } + void SetupPmlBoxTri2D(); + void SetupPmlBoxQuad2D(); + void SetupPmlBoxTet3D(); + void SetupPmlBoxHex3D(); // mesh reader - virtual void ParallelReader(const char *fileName) = 0; - - // repartition elements in parallel - virtual void GeometricPartition() = 0; + void ReadGmsh(const std::string fileName) { + switch (elementType) { + case Mesh::TRIANGLES: + if(dim==2) + ReadGmshTri2D(fileName); + else + ReadGmshTri3D(fileName); + break; + case Mesh::QUADRILATERALS: + if(dim==2) + ReadGmshQuad2D(fileName); + else + 
ReadGmshQuad3D(fileName); + break; + case Mesh::TETRAHEDRA: + ReadGmshTet3D(fileName); + break; + case Mesh::HEXAHEDRA: + ReadGmshHex3D(fileName); + break; + } + } + void ReadGmshTri2D(const std::string fileName); + void ReadGmshTri3D(const std::string fileName); + void ReadGmshQuad2D(const std::string fileName); + void ReadGmshQuad3D(const std::string fileName); + void ReadGmshTet3D(const std::string fileName); + void ReadGmshHex3D(const std::string fileName); + + // reference nodes and operators + void ReferenceNodes() { + switch (elementType) { + case Mesh::TRIANGLES: + ReferenceNodesTri2D(); + break; + case Mesh::QUADRILATERALS: + ReferenceNodesQuad2D(); + break; + case Mesh::TETRAHEDRA: + ReferenceNodesTet3D(); + break; + case Mesh::HEXAHEDRA: + ReferenceNodesHex3D(); + break; + } + } + void ReferenceNodesTri2D(); + void ReferenceNodesQuad2D(); + void ReferenceNodesTet3D(); + void ReferenceNodesHex3D(); + + // repartition elements + void Partition(); /* build parallel face connectivity */ - void ParallelConnect(); void Connect(); // build element-boundary connectivity void ConnectBoundary(); - virtual void ReferenceNodes(int N) = 0; + // face-vertex to face-vertex connection + void ConnectFaceVertices(); - /* compute x,y,z coordinates of each node */ - virtual void PhysicalNodes() = 0; - - // compute geometric factors for local to physical map - virtual void GeometricFactors() = 0; - - virtual void SurfaceGeometricFactors() = 0; - - // serial face-node to face-node connection - virtual void ConnectFaceNodes() = 0; + // face-node to face-node connection + void ConnectFaceNodes(); // setup halo region void HaloSetup(); - // setup trace halo - void HaloRingSetup(); - - // setup trace halo - halo_t* HaloTraceSetup(int Nfields); - /* build global connectivity in parallel */ - void ParallelConnectNodes(); + void ConnectNodes(); /* build global gather scatter ops */ - void ParallelGatherScatterSetup(); + void GatherScatterSetup(); - //Setup PML elements - void 
PmlSetup(); - void MultiRatePmlSetup(); - - //Multirate partitioning - void MultiRateSetup(dfloat *EToDT); - - // Multirate trace halo - halo_t** MultiRateHaloTraceSetup(int Nfields); - - virtual void OccaSetup(); - - virtual void CubatureSetup()=0; - - virtual void CubatureNodes()=0; - - // print out parallel partition i - void PrintPartitionStatistics(); - - virtual dfloat ElementCharacteristicLength(dlong e) = 0; - - dfloat MinCharacteristicLength(); - - virtual void PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch=nullptr)=0; - - void RecursiveSpectralBisectionPartition(); - - void MassMatrixApply(occa::memory& o_q, occa::memory& o_Mq); - virtual void MassMatrixKernelSetup(int Nfields)=0; - - //create a new mesh object with the same geometry, but different degree - mesh_t& SetupNewDegree(int Nf); - - mesh_t* SetupRingPatch(); - - mesh_t* SetupSEMFEM(hlong **globalIds, int *Nfp, int **faceNodes); + /* compute x,y,z coordinates of each node */ + void PhysicalNodes() { + switch (elementType) { + case Mesh::TRIANGLES: + if(dim==2) + PhysicalNodesTri2D(); + else + PhysicalNodesTri3D(); + break; + case Mesh::QUADRILATERALS: + if(dim==2) + PhysicalNodesQuad2D(); + else + PhysicalNodesQuad3D(); + break; + case Mesh::TETRAHEDRA: + PhysicalNodesTet3D(); + break; + case Mesh::HEXAHEDRA: + PhysicalNodesHex3D(); + break; + } + } + void PhysicalNodesTri2D(); + void PhysicalNodesTri3D(); + void PhysicalNodesQuad2D(); + void PhysicalNodesQuad3D(); + void PhysicalNodesTet3D(); + void PhysicalNodesHex3D(); - void DegreeRaiseMatrix1D(int Nc, int Nf, dfloat *P); - void DegreeRaiseMatrixTri2D(int Nc, int Nf, dfloat *P); - void DegreeRaiseMatrixTet3D(int Nc, int Nf, dfloat *P); + // compute geometric factors for local to physical map + void GeometricFactors() { + switch (elementType) { + case Mesh::TRIANGLES: + if(dim==2) + GeometricFactorsTri2D(); + else + GeometricFactorsTri3D(); + break; + case Mesh::QUADRILATERALS: + if(dim==2) + GeometricFactorsQuad2D(); + else + 
GeometricFactorsQuad3D(); + break; + case Mesh::TETRAHEDRA: + GeometricFactorsTet3D(); + break; + case Mesh::HEXAHEDRA: + GeometricFactorsHex3D(); + break; + } + } + void GeometricFactorsTri2D(); + void GeometricFactorsTri3D(); + void GeometricFactorsQuad2D(); + void GeometricFactorsQuad3D(); + void GeometricFactorsTet3D(); + void GeometricFactorsHex3D(); + + void SurfaceGeometricFactors() { + switch (elementType) { + case Mesh::TRIANGLES: + if(dim==2) + SurfaceGeometricFactorsTri2D(); + else + SurfaceGeometricFactorsTri3D(); + break; + case Mesh::QUADRILATERALS: + if(dim==2) + SurfaceGeometricFactorsQuad2D(); + else + SurfaceGeometricFactorsQuad3D(); + break; + case Mesh::TETRAHEDRA: + SurfaceGeometricFactorsTet3D(); + break; + case Mesh::HEXAHEDRA: + SurfaceGeometricFactorsHex3D(); + break; + } + } + void SurfaceGeometricFactorsTri2D(); + void SurfaceGeometricFactorsTri3D(); + void SurfaceGeometricFactorsQuad2D(); + void SurfaceGeometricFactorsQuad3D(); + void SurfaceGeometricFactorsTet3D(); + void SurfaceGeometricFactorsHex3D(); + + void CubatureSetupTri2D(); + void CubatureSetupQuad2D(); + void CubatureSetupTet3D(); + void CubatureSetupHex3D(); + + void CubaturePhysicalNodesTri2D(); + void CubaturePhysicalNodesTri3D(); + void CubaturePhysicalNodesQuad2D(); + void CubaturePhysicalNodesQuad3D(); + void CubaturePhysicalNodesTet3D(); + void CubaturePhysicalNodesHex3D(); + + void PlotInterpTri2D(const memory q, memory Iq, memory scratch); + void PlotInterpQuad2D(const memory q, memory Iq, memory scratch); + void PlotInterpTet3D(const memory q, memory Iq, memory scratch); + void PlotInterpHex3D(const memory q, memory Iq, memory scratch); + + void MassMatrixKernelSetupTri2D(int Nfields); + void MassMatrixKernelSetupQuad2D(int Nfields); + void MassMatrixKernelSetupTet3D(int Nfields); + void MassMatrixKernelSetupHex3D(int Nfields); + + dfloat ElementCharacteristicLengthTri2D(dlong e); + dfloat ElementCharacteristicLengthQuad2D(dlong e); + dfloat 
ElementCharacteristicLengthTet3D(dlong e); + dfloat ElementCharacteristicLengthHex3D(dlong e); /***************************************************************************/ // Basic codes for generating nodes, polynomials, matrices, etc. -public: + public: //1D - static void Nodes1D(int N, dfloat *r); - static void EquispacedNodes1D(int _N, dfloat *_r); - static void OrthonormalBasis1D(dfloat a, int i, dfloat *P); - static void GradOrthonormalBasis1D(dfloat a, int i, dfloat *Pr); - static void Vandermonde1D(int N, int Npoints, dfloat *r, dfloat *V); - static void GradVandermonde1D(int N, int Npoints, dfloat *r, dfloat *Vr); - - static void MassMatrix1D(int _Np, dfloat *V, dfloat *MM); - static void Dmatrix1D(int _N, int NpointsIn, dfloat *_rIn, int NpointsOut, dfloat *_rOut, dfloat *_Dr); - static void InterpolationMatrix1D(int _N,int NpointsIn, dfloat *rIn, int NpointsOut, dfloat *rOut, dfloat *I); - static void CubatureWeakDmatrix1D(int _Nq, int _cubNq, dfloat *_cubProject, dfloat *_cubD, dfloat *_cubPDT); + static void Nodes1D(const int _N, memory& _r); + static void EquispacedNodes1D(const int _N, memory& _r); + static void OrthonormalBasis1D(const dfloat a, const int i, dfloat& P); + static void GradOrthonormalBasis1D(const dfloat a, const int i, dfloat& Pr); + static void Vandermonde1D(const int _N, + const memory _r, + memory& V); + static void GradVandermonde1D(const int _N, + const memory _r, + memory& Vr); + + static void MassMatrix1D(const int _Np, + const memory V, + memory& _MM); + static void Dmatrix1D(const int _N, + const memory _rIn, + const memory _rOut, + memory& _Dr); + static void InterpolationMatrix1D(const int _N, + const memory _rIn, + const memory _rOut, + memory& I); + static void DegreeRaiseMatrix1D(const int Nc, const int Nf, + memory& P); + static void CubatureWeakDmatrix1D(const int _Nq, const int _cubNq, + const memory _cubProject, + const memory _cubD, + memory& _cubPDT); //Jacobi polynomial evaluation - static dfloat JacobiP(dfloat 
a, dfloat alpha, dfloat beta, int N); - static dfloat GradJacobiP(dfloat a, dfloat alpha, dfloat beta, int N); + static dfloat JacobiP(const dfloat a, const dfloat alpha, + const dfloat beta, const int _N); + static dfloat GradJacobiP(const dfloat a, const dfloat alpha, + const dfloat beta, const int _N); //Gauss-Legendre-Lobatto quadrature nodes - static void JacobiGLL(int N, dfloat *x, dfloat *w=NULL); + static void JacobiGLL(const int _N, + memory& _x); + static void JacobiGLL(const int _N, + memory& _x, + memory& _w); //Nth order Gauss-Jacobi quadrature nodes and weights - static void JacobiGQ(dfloat alpha, dfloat beta, int N, dfloat *x, dfloat *w); + static void JacobiGQ(const dfloat alpha, const dfloat beta, + const int _N, + memory& _x, + memory& _w); //Tris - static void NodesTri2D(int _N, dfloat *_r, dfloat *_s); - static void FaceNodesTri2D(int _N, dfloat *_r, dfloat *_s, int *_faceNodes); - static void VertexNodesTri2D(int _N, dfloat *_r, dfloat *_s, int *_vertexNodes); - static void EquispacedNodesTri2D(int _N, dfloat *_r, dfloat *_s); - static void EquispacedEToVTri2D(int _N, int *_EToV); - static void SEMFEMNodesTri2D(int _N, int *_Np, dfloat **_r, dfloat **_s); - static void SEMFEMEToVTri2D(int _N, int *_NelFEM, int **_EToV); - static void OrthonormalBasisTri2D(dfloat a, dfloat b, int i, int j, dfloat *P); - static void GradOrthonormalBasisTri2D(dfloat a, dfloat b, int i, int j, dfloat *Pr, dfloat *Ps); - static void VandermondeTri2D(int N, int Npoints, dfloat *r, dfloat *s, dfloat *V); - static void GradVandermondeTri2D(int N, int Npoints, dfloat *r, dfloat *s, dfloat *Vr, dfloat *Vs); - static void MassMatrixTri2D(int _Np, dfloat *V, dfloat *_MM); - static void invMassMatrixTri2D(int _Np, dfloat *V, dfloat *_invMM); - static void DmatrixTri2D(int _N, int Npoints, dfloat *_r, dfloat *_s, - dfloat *_Dr, dfloat *_Ds); - static void LIFTmatrixTri2D(int _N, int *_faceNodes, - dfloat *_r, dfloat *_s, dfloat *_LIFT); - static void 
SurfaceMassMatrixTri2D(int _N, dfloat *_MM, dfloat *_LIFT, dfloat *_sM); - static void SmatrixTri2D(int _N, dfloat *_Dr, dfloat *_Ds, dfloat *_MM, - dfloat *_Srr, dfloat *_Srs, dfloat *_Sss); - static void InterpolationMatrixTri2D(int _N, - int NpointsIn, dfloat *rIn, dfloat *sIn, - int NpointsOut, dfloat *rOut, dfloat *sOut, - dfloat *I); - static void CubatureNodesTri2D(int cubTriN, int*cubNp, dfloat **cubTrir, dfloat **cubTris, dfloat **cubTriw); - static void CubaturePmatrixTri2D(int _N, int _Np, dfloat *_r, dfloat *_s, - int _cubNp, dfloat *_cubr, dfloat *_cubs, dfloat *_cubProject); - static void CubatureWeakDmatricesTri2D(int _N, int _Np, dfloat *_r, dfloat *_s, - int _cubNp, dfloat *_cubr, dfloat *_cubs, - dfloat *_cubPDrT, dfloat *_cubPDsT); - static void CubatureSurfaceMatricesTri2D(int _N, int _Np, dfloat *_r, dfloat *_s, int *_faceNodes, - int _intNfp, dfloat *_intr, dfloat *_intw, - dfloat *_intInterp, dfloat *_intLIFT); - static void SEMFEMInterpMatrixTri2D(int _N, - int _Np, dfloat *_r, dfloat *_s, - int _NpFEM, dfloat *rFEM, dfloat *sFEM, - dfloat *I); - - static void Warpfactor(int _N, int Npoints, dfloat *r, dfloat *w); - static void WarpBlendTransformTri2D(int _N, int _Npoints, dfloat *_r, dfloat *_s, dfloat alphaIn=-1); + static void NodesTri2D(const int _N, + memory& _r, + memory& _s); + static void FaceNodesTri2D(const int _N, + const memory _r, + const memory _s, + memory& _faceNodes); + static void VertexNodesTri2D(const int _N, + const memory _r, + const memory _s, + memory& _vertexNodes); + static void FaceNodeMatchingTri2D(const memory _r, + const memory _s, + const memory _faceNodes, + const memory _faceVertices, + memory& R); + static void EquispacedNodesTri2D(const int _N, + memory& _r, + memory& _s); + static void EquispacedEToVTri2D(const int _N, memory& _EToV); + static void SEMFEMNodesTri2D(const int _N, + int& _Np, + memory& _r, + memory& _s); + static void SEMFEMEToVTri2D(const int _N, + int& _NelFEM, + memory& _EToV); + static 
void OrthonormalBasisTri2D(const dfloat _r, const dfloat _s, + const int i, const int j, + dfloat& P); + static void GradOrthonormalBasisTri2D(const dfloat _r, const dfloat _s, + const int i, const int j, + dfloat& Pr, dfloat& Ps); + static void VandermondeTri2D(const int _N, + const memory _r, + const memory _s, + memory& V); + static void GradVandermondeTri2D(const int _N, + const memory _r, + const memory _s, + memory& Vr, + memory& Vs); + static void MassMatrixTri2D(const int _Np, + const memory V, + memory& _MM); + static void invMassMatrixTri2D(const int _Np, + const memory V, + memory& _invMM); + static void DmatrixTri2D(const int _N, + const memory _r, + const memory _s, + memory& _D); + static void LIFTmatrixTri2D(const int _N, + const memory _faceNodes, + const memory _r, + const memory _s, + memory& _LIFT); + static void SurfaceMassMatrixTri2D(const int _N, + const memory _MM, + const memory _LIFT, + memory& _sM); + static void SmatrixTri2D(const int _N, + const memory _Dr, + const memory _Ds, + const memory _MM, + memory& _S); + static void InterpolationMatrixTri2D(const int _N, + const memory rIn, + const memory sIn, + const memory rOut, + const memory sOut, + memory& I); + static void DegreeRaiseMatrixTri2D(const int Nc, const int Nf, + memory& P); + static void CubatureNodesTri2D(const int cubTriN, + int& _cubNp, + memory& cubTrir, + memory& cubTris, + memory& cubTriw); + static void CubaturePmatrixTri2D(const int _N, + const memory _r, + const memory _s, + const memory _cubr, + const memory _cubs, + memory& _cubProject); + static void CubatureWeakDmatricesTri2D(const int _N, + const memory _r, + const memory _s, + const memory _cubr, + const memory _cubs, + memory& _cubPDT); + static void CubatureSurfaceMatricesTri2D(const int _N, + const memory _r, + const memory _s, + const memory _faceNodes, + const memory _intr, + const memory _intw, + memory& _intInterp, + memory& _intLIFT); + static void SEMFEMInterpMatrixTri2D(const int _N, + const memory _r, 
+ const memory _s, + const memory _rFEM, + const memory _sFEM, + memory& I); + + static void Warpfactor(const int _N, + const memory _r, + memory warp); + static void WarpBlendTransformTri2D(const int _N, + memory _r, + memory _s, + const dfloat alphaIn=-1); //Quads - static void NodesQuad2D(int _N, dfloat *_r, dfloat *_s); - static void FaceNodesQuad2D(int _N, dfloat *_r, dfloat *_s, int *_faceNodes); - static void VertexNodesQuad2D(int _N, dfloat *_r, dfloat *_s, int *_vertexNodes); - static void EquispacedNodesQuad2D(int _N, dfloat *_r, dfloat *_s); - static void EquispacedEToVQuad2D(int _N, int *_EToV); - static void SEMFEMEToVQuad2D(int _N, int *_EToV); - static void OrthonormalBasisQuad2D(dfloat a, dfloat b, int i, int j, dfloat *P); - static void GradOrthonormalBasisQuad2D(dfloat a, dfloat b, int i, int j, dfloat *Pr, dfloat *Ps); - static void VandermondeQuad2D(int N, int Npoints, dfloat *r, dfloat *s, dfloat *V); - static void GradVandermondeQuad2D(int N, int Npoints, dfloat *r, dfloat *s, dfloat *Vr, dfloat *Vs); - static void MassMatrixQuad2D(int _Np, dfloat *V, dfloat *_MM); - static void LumpedMassMatrixQuad2D(int _N, dfloat *_gllw, dfloat *_MM); - static void invLumpedMassMatrixQuad2D(int _N, dfloat *_gllw, dfloat *_invMM); - static void DmatrixQuad2D(int _N, int Npoints, dfloat *_r, dfloat *_s, - dfloat *_Dr, dfloat *_Ds); - static void InterpolationMatrixQuad2D(int _N, - int NpointsIn, dfloat *rIn, dfloat *sIn, - int NpointsOut, dfloat *rOut, dfloat *sOut, - dfloat *I); + static void NodesQuad2D(const int _N, + memory& _r, + memory& _s); + static void FaceNodesQuad2D(const int _N, + const memory _r, + const memory _s, + memory& _faceNodes); + static void VertexNodesQuad2D(const int _N, + const memory _r, + const memory _s, + memory& _vertexNodes); + static void FaceNodeMatchingQuad2D(const memory _r, + const memory _s, + const memory _faceNodes, + const memory _faceVertices, + memory& R); + static void EquispacedNodesQuad2D(const int _N, + memory& 
_r, + memory& _s); + static void EquispacedEToVQuad2D(const int _N, memory& _EToV); + static void SEMFEMEToVQuad2D(const int _N, memory& _EToV); + static void OrthonormalBasisQuad2D(const dfloat a, const dfloat b, + const int i, const int j, + dfloat& P); + static void GradOrthonormalBasisQuad2D(const dfloat a, const dfloat b, + const int i, const int j, + dfloat& Pr, dfloat& Ps); + static void VandermondeQuad2D(const int _N, + const memory _r, + const memory _s, + memory& V); + static void GradVandermondeQuad2D(const int _N, + const memory _r, + const memory _s, + memory& Vr, + memory& Vs); + static void MassMatrixQuad2D(const int _Np, + const memory V, + memory& _MM); + static void LumpedMassMatrixQuad2D(const int _N, + const memory _gllw, + memory& _MM); + static void invLumpedMassMatrixQuad2D(const int _N, + const memory _gllw, + memory& _invMM); + static void DmatrixQuad2D(const int _N, + const memory _r, + const memory _s, + memory& _D); + static void InterpolationMatrixQuad2D(const int _N, + const memory rIn, + const memory sIn, + const memory rOut, + const memory sOut, + memory& I); //Tets - static void NodesTet3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t); - static void FaceNodesTet3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_faceNodes); - static void VertexNodesTet3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_vertexNodes); - static void EquispacedNodesTet3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t); - static void EquispacedEToVTet3D(int _N, int *_EToV); - static void SEMFEMEToVTet3D(int _N, int *_EToV); - static void OrthonormalBasisTet3D(dfloat a, dfloat b, dfloat c, int i, int j, int k, dfloat *P); - static void GradOrthonormalBasisTet3D(dfloat a, dfloat b, dfloat c, int i, int j, int k, dfloat *Pr, dfloat *Ps, dfloat *Pt); - static void VandermondeTet3D(int N, int Npoints, dfloat *r, dfloat *s, dfloat *t, dfloat *V); - static void GradVandermondeTet3D(int N, int Npoints, dfloat *r, dfloat *s, dfloat *t, dfloat *Vr, dfloat *Vs, dfloat 
*Vt); - static void MassMatrixTet3D(int _Np, dfloat *V, dfloat *_MM); - static void invMassMatrixTet3D(int _Np, dfloat *V, dfloat *_invMM); - static void DmatrixTet3D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloat *_t, - dfloat *_Dr, dfloat *_Ds, dfloat *_Dt); - static void LIFTmatrixTet3D(int _N, int *_faceNodes, - dfloat *_r, dfloat *_s, dfloat *_t, dfloat *_LIFT); - static void SurfaceMassMatrixTet3D(int _N, dfloat *_MM, dfloat *_LIFT, dfloat *_sM); - static void SmatrixTet3D(int _N, dfloat *_Dr, dfloat *_Ds, dfloat *_Dt, dfloat *_MM, - dfloat *_Srr, dfloat *_Srs, dfloat *_Srt, - dfloat *_Sss, dfloat *_Sst, dfloat *_Stt); - static void InterpolationMatrixTet3D(int _N, - int NpointsIn, dfloat *rIn, dfloat *sIn, dfloat *tIn, - int NpointsOut, dfloat *rOut, dfloat *sOut, dfloat *tOut, - dfloat *I); - static void CubatureNodesTet3D(int cubN, int*cubNp, dfloat **cubr, dfloat **cubs, dfloat **cubt, dfloat **cubw); - static void CubaturePmatrixTet3D(int _N, int _Np, dfloat *_r, dfloat *_s, dfloat *_t, - int _cubNp, dfloat *_cubr, dfloat *_cubs, dfloat *_cubt, - dfloat *_cubProject); - static void CubatureWeakDmatricesTet3D(int _N, int _Np, dfloat *_r, dfloat *_s, dfloat *_t, - int _cubNp, dfloat *_cubr, dfloat *_cubs, dfloat *_cubt, - dfloat *_cubPDrT, dfloat *_cubPDsT, dfloat *_cubPDtT); - static void CubatureSurfaceMatricesTet3D(int _N, int _Np, dfloat *_r, dfloat *_s, dfloat *_t, int *_faceNodes, - int _intNfp, dfloat *_intr, dfloat *_ints, dfloat *_intw, - dfloat *_intInterp, dfloat *_intLIFT); - static void SEMFEMInterpMatrixTet3D(int _N, int _Np, dfloat *_r, dfloat *_s, dfloat *_t, - int _NpFEM, dfloat *_rFEM, dfloat *_sFEM, dfloat *_tFEM, - dfloat *I); - static void WarpShiftFace3D(int _N, int Npoints, dfloat alpha, - dfloat *L1, dfloat *L2, dfloat *L3, - dfloat *w1, dfloat *w2); - static void WarpBlendTransformTet3D(int _N, int _Npoints, dfloat *_r, dfloat *_s, dfloat *_t, dfloat alphaIn=-1); + static void NodesTet3D(const int _N, + memory& _r, + memory& 
_s, + memory& _t); + static void FaceNodesTet3D(const int _N, + const memory _r, + const memory _s, + const memory _t, + memory& _faceNodes); + static void VertexNodesTet3D(const int _N, + const memory _r, + const memory _s, + const memory _t, + memory& _vertexNodes); + static void FaceNodeMatchingTet3D(const memory _r, + const memory _s, + const memory _t, + const memory _faceNodes, + const memory _faceVertices, + memory& R); + static void EquispacedNodesTet3D(const int _N, + memory& _r, + memory& _s, + memory& _t); + static void EquispacedEToVTet3D(const int _N, memory& _EToV); + static void SEMFEMEToVTet3D(const int _N, memory& _EToV); + static void OrthonormalBasisTet3D(const dfloat _r, const dfloat _s, const dfloat _t, + const int i, const int j, const int k, + dfloat& P); + static void GradOrthonormalBasisTet3D(const dfloat _r, const dfloat _s, const dfloat _t, + const int i, const int j, const int k, + dfloat& Pr, dfloat& Ps, dfloat& Pt); + static void VandermondeTet3D(const int _N, + const memory _r, + const memory _s, + const memory _t, + memory& V); + static void GradVandermondeTet3D(const int _N, + const memory _r, + const memory _s, + const memory _t, + memory& Vr, + memory& Vs, + memory& Vt); + static void MassMatrixTet3D(const int _Np, + const memory V, + memory& _MM); + static void invMassMatrixTet3D(const int _Np, + const memory V, + memory& _invMM); + static void DmatrixTet3D(const int _N, + const memory _r, + const memory _s, + const memory _t, + memory& _D); + static void LIFTmatrixTet3D(const int _N, + const memory _faceNodes, + const memory _r, + const memory _s, + const memory _t, + memory& _LIFT); + static void SurfaceMassMatrixTet3D(const int _N, + const memory _MM, + const memory _LIFT, + memory& _sM); + static void SmatrixTet3D(const int _N, + const memory _Dr, + const memory _Ds, + const memory _Dt, + const memory _MM, + memory& _S); + static void InterpolationMatrixTet3D(const int _N, + const memory rIn, + const memory sIn, + const 
memory tIn, + const memory rOut, + const memory sOut, + const memory tOut, + memory& I); + static void DegreeRaiseMatrixTet3D(const int Nc, const int Nf, + memory& P); + static void CubatureNodesTet3D(const int cubTetN, + int& _cubNp, + memory& _cubr, + memory& _cubs, + memory& _cubt, + memory& _cubw); + static void CubaturePmatrixTet3D(const int _N, + const memory _r, + const memory _s, + const memory _t, + const memory _cubr, + const memory _cubs, + const memory _cubt, + memory& _cubProject); + static void CubatureWeakDmatricesTet3D(const int _N, + const memory _r, + const memory _s, + const memory _t, + const memory _cubr, + const memory _cubs, + const memory _cubt, + memory& _cubPDT); + static void CubatureSurfaceMatricesTet3D(const int _N, + const memory _r, + const memory _s, + const memory _t, + const memory _faceNodes, + const memory _intr, + const memory _ints, + const memory _intw, + memory& _intInterp, + memory& _intLIFT); + static void SEMFEMInterpMatrixTet3D(const int _N, + const memory _r, + const memory _s, + const memory _t, + const memory _rFEM, + const memory _sFEM, + const memory _tFEM, + memory& I); + static void WarpShiftFace3D(const int _N, const dfloat alpha, + const memory L1, + const memory L2, + const memory L3, + memory w1, + memory w2); + static void WarpBlendTransformTet3D(const int _N, + memory _r, + memory _s, + memory _t, + const dfloat alphaIn=-1); //Hexs - static void NodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t); - static void FaceNodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_faceNodes); - static void VertexNodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_vertexNodes); - static void EquispacedNodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t); - static void EquispacedEToVHex3D(int _N, int *_EToV); - static void SEMFEMEToVHex3D(int _N, int *_EToV); - static void OrthonormalBasisHex3D(dfloat a, dfloat b, dfloat c, int i, int j, int k, dfloat *P); - static void GradOrthonormalBasisHex3D(dfloat 
a, dfloat b, dfloat c, int i, int j, int k, dfloat *Pr, dfloat *Ps, dfloat *Pt); - static void VandermondeHex3D(int N, int Npoints, dfloat *r, dfloat *s, dfloat *t, dfloat *V); - static void GradVandermondeHex3D(int N, int Npoints, dfloat *r, dfloat *s, dfloat *t, - dfloat *Vr, dfloat *Vs, dfloat *Vt); - static void MassMatrixHex3D(int _Np, dfloat *V, dfloat *_MM); - static void LumpedMassMatrixHex3D(int _N, dfloat *_gllw, dfloat *_MM); - static void invLumpedMassMatrixHex3D(int _N, dfloat *_gllw, dfloat *_invMM); - static void DmatrixHex3D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloat *_t, - dfloat *_Dr, dfloat *_Ds, dfloat *_Dt); - static void InterpolationMatrixHex3D(int _N, - int NpointsIn, dfloat *rIn, dfloat *sIn, dfloat *tIn, - int NpointsOut, dfloat *rOut, dfloat *sOut, dfloat *tOut, - dfloat *I); + static void NodesHex3D(const int _N, + memory& _r, + memory& _s, + memory& _t); + static void FaceNodesHex3D(const int _N, + const memory _r, + const memory _s, + const memory _t, + memory& _faceNodes); + static void VertexNodesHex3D(const int _N, + const memory _r, + const memory _s, + const memory _t, + memory& _vertexNodes); + static void FaceNodeMatchingHex3D(const memory _r, + const memory _s, + const memory _t, + const memory _faceNodes, + const memory _faceVertices, + memory& R); + static void EquispacedNodesHex3D(const int _N, + memory& _r, + memory& _s, + memory& _t); + static void EquispacedEToVHex3D(const int _N, memory& _EToV); + static void SEMFEMEToVHex3D(const int _N, memory& _EToV); + static void OrthonormalBasisHex3D(const dfloat a, const dfloat b, const dfloat c, + const int i, const int j, const int k, + dfloat& P); + static void GradOrthonormalBasisHex3D(const dfloat a, const dfloat b, const dfloat c, + const int i, const int j, const int k, + dfloat& Pr, dfloat& Ps, dfloat& Pt); + static void VandermondeHex3D(const int _N, + const memory _r, + const memory _s, + const memory _t, + memory& V); + static void GradVandermondeHex3D(const 
int _N, + const memory _r, + const memory _s, + const memory _t, + memory& Vr, + memory& Vs, + memory& Vt); + static void MassMatrixHex3D(const int _Np, + const memory V, + memory& _MM); + static void LumpedMassMatrixHex3D(const int _N, + const memory _gllw, + memory& _MM); + static void invLumpedMassMatrixHex3D(const int _N, + const memory _gllw, + memory& _invMM); + static void DmatrixHex3D(const int _N, + const memory _r, + const memory _s, + const memory _t, + memory& _D); + static void InterpolationMatrixHex3D(const int _N, + const memory rIn, + const memory sIn, + const memory tIn, + const memory rOut, + const memory sOut, + const memory tOut, + memory& I); }; +} //namespace libp + #endif diff --git a/include/mesh/mesh2D.hpp b/include/mesh/mesh2D.hpp deleted file mode 100644 index 1ced54f08..000000000 --- a/include/mesh/mesh2D.hpp +++ /dev/null @@ -1,94 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -#ifndef MESH2D_HPP -#define MESH2D_HPP 1 - -#include "meshDefines2D.h" - -class mesh2D: public mesh_t { -public: - mesh2D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm); - - // repartition elements in parallel - void GeometricPartition(); - - // serial face-node to face-node connection - void ConnectFaceNodes(); - - // setup occa buffers - virtual void OccaSetup(); - - // print out mesh partition in parallel - void PrintVTU(const char *fileName); -}; - -class meshTri2D: public mesh2D { -public: - meshTri2D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm); - void ParallelReader(const char *fileName); - void SetupBox(); - void SetupPmlBox(); - void ReferenceNodes(int N); - void PhysicalNodes(); - void GeometricFactors(); - void SurfaceGeometricFactors(); - void OccaSetup(); - - void CubatureSetup(); - void CubatureNodes(); - - void MassMatrixKernelSetup(int Nfields); - - void PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch=nullptr); - - dfloat ElementCharacteristicLength(dlong e); -}; - -class meshQuad2D: public mesh2D { -public: - meshQuad2D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm); - void ParallelReader(const char *fileName); - void SetupBox(); - void SetupPmlBox(); - void ReferenceNodes(int N); - void PhysicalNodes(); - void GeometricFactors(); - void SurfaceGeometricFactors(); - void OccaSetup(); - - void CubatureSetup(); - void CubatureNodes(); - - void MassMatrixKernelSetup(int Nfields); - - void PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch=nullptr); - - dfloat ElementCharacteristicLength(dlong e); -}; - -#endif - diff --git a/include/mesh/mesh3D.hpp b/include/mesh/mesh3D.hpp deleted file mode 100644 index 
8eb4a4814..000000000 --- a/include/mesh/mesh3D.hpp +++ /dev/null @@ -1,141 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- -*/ - -#ifndef MESH3D_HPP -#define MESH3D_HPP 1 - -#include "meshDefines3D.h" - -class mesh3D: public mesh_t { -public: - mesh3D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm); - - // repartition elements in parallel - void GeometricPartition(); - - // serial face-node to face-node connection - void ConnectFaceNodes(); - - inline - void ConnectFaceModes(int *faceModes, dfloat *V) {}; //not implemented yet - - // setup occa buffers - virtual void OccaSetup(); - - // print out mesh partition in parallel - void PrintVTU(const char *fileName); -}; - -class meshTri3D: public mesh3D { -public: - meshTri3D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm); - void ParallelReader(const char *fileName); - void SetupBox(); - void SetupPmlBox(); - void ReferenceNodes(int N); - void PhysicalNodes(); - void GeometricFactors(); - void SurfaceGeometricFactors(); - void OccaSetup(); - - void CubatureSetup(); - void CubatureNodes(); - - void MassMatrixKernelSetup(int Nfields); - - void PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch=nullptr); - - dfloat ElementCharacteristicLength(dlong e); -}; - -class meshQuad3D: public mesh3D { -public: - meshQuad3D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm); - void ParallelReader(const char *fileName); - void SetupBox(); - void SetupPmlBox(); - void ReferenceNodes(int N); - void PhysicalNodes(); - void GeometricFactors(); - void SurfaceGeometricFactors(); - void OccaSetup(); - - void CubatureSetup(); - void CubatureNodes(); - - void MassMatrixKernelSetup(int Nfields); - - void PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch=nullptr); - - dfloat ElementCharacteristicLength(dlong e); -}; - -class meshTet3D: public mesh3D { -public: - meshTet3D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm); - void ParallelReader(const char *fileName); - void SetupBox(); - void SetupPmlBox(); - void ReferenceNodes(int N); - void PhysicalNodes(); - void GeometricFactors(); 
- void SurfaceGeometricFactors(); - void OccaSetup(); - - void CubatureSetup(); - void CubatureNodes(); - - void MassMatrixKernelSetup(int Nfields); - - void PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch=nullptr); - - dfloat ElementCharacteristicLength(dlong e); -}; - -class meshHex3D: public mesh3D { -public: - meshHex3D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm); - void ParallelReader(const char *fileName); - void SetupBox(); - void SetupPmlBox(); - void ReferenceNodes(int N); - void PhysicalNodes(); - void GeometricFactors(); - void SurfaceGeometricFactors(); - void OccaSetup(); - - void CubatureSetup(); - void CubatureNodes(); - - void MassMatrixKernelSetup(int Nfields); - - void PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch=nullptr); - - dfloat ElementCharacteristicLength(dlong e); -}; - -#endif - diff --git a/include/mesh/meshDefines3D.h b/include/mesh/meshDefines3D.h deleted file mode 100644 index e254b22d3..000000000 --- a/include/mesh/meshDefines3D.h +++ /dev/null @@ -1,99 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -#ifndef MESH_DEFINES3D_H -#define MESH_DEFINES3D_H 1 - -/* offsets for geometric factors */ -#define RXID 0 -#define RYID 1 -#define SXID 2 -#define SYID 3 -#define JID 4 -#define JWID 5 -#define IJWID 6 -#define RZID 7 -#define SZID 8 -#define TXID 9 -#define TYID 10 -#define TZID 11 - -/* offsets for second order geometric factors */ -#define G00ID 0 -#define G01ID 1 -#define G11ID 2 -#define GWJID 3 -#define G12ID 4 -#define G02ID 5 -#define G22ID 6 - - -/* offsets for nx, ny, sJ, 1/J */ -#define NXID 0 -#define NYID 1 -#define SJID 2 -#define IJID 3 -#define IHID 4 -#define WSJID 5 -#define WIJID 6 -#define NZID 7 -#define STXID 8 -#define STYID 9 -#define STZID 10 -#define SBXID 11 -#define SBYID 12 -#define SBZID 13 -#define SURXID 14 -#define SURYID 15 -#define SURZID 16 - -// //offsets for boltzmann PML variables -// #define QXID1 0 -// #define QXID2 1 -// #define QXID3 2 -// #define QXID4 3 -// #define QXID5 4 -// #define QXID6 5 -// #define QXID8 6 -// // -// #define QYID1 7 -// #define QYID2 8 -// #define QYID3 9 -// #define QYID4 10 -// #define QYID5 11 -// #define QYID7 12 -// #define QYID9 13 -// // -// #define QZID1 14 -// #define QZID2 15 -// #define QZID3 16 -// #define QZID4 17 -// #define QZID6 18 -// #define QZID7 19 -// #define QZID10 20 - -#endif - diff --git a/include/ogs.hpp b/include/ogs.hpp old mode 100644 new mode 100755 index df81c7c91..e53c7408f --- a/include/ogs.hpp +++ b/include/ogs.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software 
and associated documentation files (the "Software"), to deal @@ -29,13 +29,15 @@ SOFTWARE. The code - MPI_Comm comm; - dlong N; - hlong id[N]; // the hlong and dlong types are defined in "types.h" - int verbose; - occa::device device + comm_t comm; + dlong N; + memory id(N); // the hlong and dlong types are defined in "types.h" + bool verbose; + bool unique; + ogs_t ogs(platform); ... - ogs_t *ogs = ogs_t::Setup(N, id, comm, verbose, device); + ogs.Setup(N, id, comm, ogs::Signed, + ogs::Auto, unique, verbose); defines a partition of the set of (processor, local index) pairs, (p,i) \in S_j iff abs(id[i]) == j on processor p @@ -43,20 +45,21 @@ SOFTWARE. same id (=j). S_0 is treated specially --- it is ignored completely (i.e., when id[i] == 0, local index i does not participate in any - gather/scatter operation + gather/scatter operation) If id[i] on proc p is negative then the pair (p,i) is "flagged". This determines the non-symmetric behavior. For the simpler, symmetric case, - all id's should be positive. + ogs::Unsigned can be passed to the 'Kind' parameter, which + treats all id's as positive. When "ogs" is no longer needed, free it with - ogsFree(ogs); + ogs.Free(); A basic gatherScatter operation is, e.g., - occa::memory o_v; + deviceMemory o_v; ... - ogs->GatherScatter(o_v, ogs_double, ogs_add, ogs_sym); + ogs.GatherScatter(o_v, 1, ogs::Add, ogs::Sym); This gs call has the effect, @@ -70,92 +73,65 @@ SOFTWARE. Summation on doubles is not the only operation and datatype supported. Support includes the operations - ogs_add, ogs_mul, ogs_max, ogs_min - and datatypes - ogs_dfloat, ogs_double, ogs_float, ogs_int, ogs_longlong, ogs_dlong, ogs_hlong. + ogs::Add, ogs::Mul, ogs::Max, ogs::Min + and datatypes: float, double, int, long long int. 
- For the nonsymmetric behavior, the "transpose" parameter is important: + For the nonsymmetric behavior, the "Transpose" parameter is important: - ogs->GatherScatter(o_v, ogs_double, ogs_add, [ogs_notrans/ogs_trans/ogs_sym]); + ogs.GatherScatter(o_v, 1, ogs::Add, ogs::[NoTrans/Trans/Sym]); - When transpose == ogs_notrans, any "flagged" (p,i) pairs (id[i] negative on p) + When transpose == ogs::NoTrans, any "flagged" (p,i) pairs (id[i] negative on p) do not participate in the sum, but *do* still receive the sum on output. As a special case, when only one (p,i) pair is unflagged per group this corresponds to the rectangular "Q" matrix referred to above. - When transpose == ogs_trans, the "flagged" (p,i) pairs *do* participate in the sum, + When transpose == ogs::Trans, the "flagged" (p,i) pairs *do* participate in the sum, but do *not* get set on output. In the special case of only one unflagged (p,i) pair, this corresponds to the transpose of "Q" referred to above. - When transpose == ogs_sym, all ids are considered "unflagged". That is, + When transpose == ogs::Sym, all ids are considered "unflagged". That is, the "flagged" (p,i) pairs *do* participate in the sum, and *do* get set on output. - An additional nonsymmetric operation is + When the 'unique' parameter is passed as 'true', the setup call modifies ids, + "flagging" (by negating id[i]) all (p,i) pairs in each group except one. + The sole "unflagged" member of the group is chosen in an arbitrary but + consistent way. When all groups of (p,i) pairs have a single "unflagged" + pair in this manner, an additional nonsymmetric operation is available: - ogs->Gather(o_Gv, o_v, ogs_double, ogs_add, ogs_notrans); + ogs.Gather(o_Gv, o_v, 1, ogs::Add, ogs::Trans); this has the effect of "assembling" the vector o_Gv. That is o_Gv[gid[j]] <-- \sum_{ (p,j) \in S_{id[i]} } o_v_(p) [j] for some ordering gid.
As with the GatherScatter operation, when - transpose == ogs_notrans, any "flagged" (p,i) pairs (id[i] negative on p) - do not participate in the sum, whereas when transpose == ogs_trans the "flagged" - (p,i) pairs *do* participate in the sum. Using transpose == ogs_sym is not - supported (the symmetrized version of this operation is just GatherScatter). + Transpose == ogs::NoTrans, any "flagged" (p,i) pairs (id[i] negative on p) + do not participate in the sum, otherwise the "flagged" (p,i) pairs *do* + participate in the sum. - The reverse of this operation is + The inverse of this operation is - ogs->Scatter(o_v, o_Gv, ogs_double, ogs_add, ogs_notrans); + ogs.Scatter(o_v, o_Gv, 1, ogs::Add, ogs::Trans); which has the effect of scattering in the assembled entries in o_Gv back to the - orginal ordering. When transpose == ogs_notrans, "flagged" (p,i) pairs (id[i] - negative on p) recieve their corresponding entry from o_Gv, and when - transpose == ogs_trans the "flagged" (p,i) pairs do *not* recieve an entry. - Using transpose == ogs_sym is not supported. + original ordering. When Transpose == ogs::Trans, "flagged" (p,i) pairs (id[i] + negative on p) do *not* receive their corresponding entry from o_Gv, otherwise + the "flagged" (p,i) pairs receive an entry. - A versions for vectors (contiguously packed) is, e.g., + For operating on contiguously packed vectors, the K parameter is used, e.g., - occa::memory o_v; - ogs->GatherScatterVec(o_v, k, ogs_double, ogs_add, ogs_sym); + ogs.GatherScatter(o_v, 3, ogs::Add, ogs::Sym); - which is like "GatherScatter" operating on the datatype double[k], + which is like "GatherScatter" operating on the datatype double[3], with summation here being vector summation. Number of messages sent is independent of k.
- For combining the communication for "GatherScatter" on multiple arrays: - - occa::memory o_v1, o_v2, ..., o_vk; - - ogs->GatherScatterMany(o_v, k, stride, ogs_double, op, trans); - - when the arrays o_v1, o_v2, ..., o_vk are packed in o_v as - - o_v1 = o_v + 0*stride, o_v2 = o_v + 1*stride, ... - - This call is equivalent to - - ogs->GatherScatter(o_v1, ogs_double, op, trans); - ogs->GatherScatter(o_v2, ogs_double, op, trans); - ... - ogs->GatherScatter(o_vk, ogs_double, op, trans); - - except that all communication is done together. - - A utility function, ogs_t::Unique is provided - - ogs_t::Unique(ids, N, comm); - - This call modifies ids, "flagging" (by negating id[i]) all (p,i) pairs in - each group except one. The sole "unflagged" member of the group is chosen - in an arbitrary but consistent way. - Asynchronous versions of the various GatherScatter functions are provided by - ogs->GatherScatterStart(o_v, ogs_double, ogs_add, ogs_sym); + ogs.GatherScatterStart(o_v, k, ogs::Add, ogs::Sym); ... - ogs->GatherScatterFinish(o_v, ogs_double, ogs_add, ogs_sym); + ogs.GatherScatterFinish(o_v, k, ogs::Add, ogs::Sym); MPI communication is not initiated in GatherScatterStart, rather some initial message packing and host<->device transfers are queued. The user can then queue @@ -163,260 +139,263 @@ SOFTWARE. calling GatherScatterFinish. The MPI communication will then take place while the user's local kernels execute to maximize the amount of communication hiding. - Finally, a thin wrapper of the ogs_t object, named halo_t is provided. This object - is intended to provided support for thin halo exchages between MPI procceses. + Finally, a specialized communication object, named halo_t is provided. This + object is analogous to an ogs_t object, where each group S_j has a sole + "unflagged" (p,i) pair, as discussed above regarding the 'unique' parameter, + and furthermore each "unflagged" (p,i) pair has a unique label ids[i] on its + process.
That is, for each "unflagged" (p,i), there are no other, flagged or + unflagged, pairs (p,j) on process p with the label ids[i]. -*/ + With this particular flagging of (p,i) pairs, simple exchange routines are + defined: -#ifndef OGS_HPP -#define OGS_HPP + halo_t halo(platform); + halo.Setup(N, ids, comm, ogs::Auto, verbose); + halo.Exchange(o_v, k); -#include "core.hpp" -#include "platform.hpp" + which has the effect of filling all "flagged" pairs (p,i) on all processes with + the corresponding value from the unique "unflagged" pair in S_j. -//ogs defs -#include "ogs/ogsDefs.h" + An additional utility operation available in the halo_t object is -/* type enum */ -#define LIST OGS_FOR_EACH_TYPE(ITEM) ogs_type_n -#define ITEM(T) ogs_##T, -typedef enum { LIST } ogs_type; -#undef ITEM -#undef LIST + halo.Combine(o_v, k); -/* operation enum */ -#define LIST OGS_FOR_EACH_OP(T,ITEM) ogs_op_n -#define ITEM(T,op) ogs_##op, -typedef enum { LIST } ogs_op; -#undef ITEM -#undef LIST - -/* transpose switch */ -typedef enum { ogs_sym, ogs_notrans, ogs_trans } ogs_transpose; + which has the effect of summing the entries in S_j and writing the result to + the sole "unflagged" pair in S_j.
-class ogsData_t { -public: - dlong Nrows=0; - dlong nnz=0; - dlong NrowBlocks=0; - - dlong *blockRowStarts=nullptr; - dlong *rowStarts=nullptr; - dlong *colIds=nullptr; - - occa::memory o_blockRowStarts; - occa::memory o_rowStarts; - occa::memory o_colIds; - - ogsData_t() {}; - - ~ogsData_t() { - if(blockRowStarts) {free(blockRowStarts); blockRowStarts=nullptr;} - if(rowStarts) {free(rowStarts); rowStarts=nullptr;} - if(colIds) {free(colIds); colIds=nullptr;} - o_blockRowStarts.free(); - o_rowStarts.free(); - o_colIds.free(); - } -}; +*/ -// OCCA+gslib gather scatter -class ogs_t { -public: - platform_t& platform; - MPI_Comm comm; +#ifndef OGS_HPP +#define OGS_HPP - dlong N=0; - dlong Nlocal=0; // number of local nodes - dlong Nhalo=0; // number of halo nodes +#include "core.hpp" +#include "platform.hpp" - dlong Ngather=0; // total number of gather nodes - dlong NgatherHalo=0; // number of halo nodes for gathered vector - hlong NgatherGlobal=0; // global number of gather nodes +namespace libp { - ogsData_t localGather, localScatter; - ogsData_t haloGather, haloScatter; +namespace ogs { - ogsData_t fusedGather, fusedScatter; - ogsData_t symGatherScatter; +/* type enum */ +typedef enum { Float, Double, Int32, Int64} Type; - void *gsh=nullptr; // gslib handle - void *gshSym=nullptr; // Symmetrized gslib handle (all ids made positive) +constexpr Type Dfloat = (std::is_same::value) + ? Double : Float; +// constexpr Type Pfloat = (std::is_same::value) +// ? Double : Float; +constexpr Type Dlong = (std::is_same::value) + ? Int32 : Int64; +constexpr Type Hlong = (std::is_same::value) + ? 
Int32 : Int64; - void* hostBuf=nullptr; - size_t hostBufSize=0; +/* operation enum */ +typedef enum { Add, Mul, Max, Min} Op; - void* haloBuf=nullptr; - occa::memory o_haloBuf; - occa::memory h_haloBuf; +/* transpose switch */ +typedef enum { Sym, NoTrans, Trans } Transpose; - dlong *GlobalToLocal; - occa::memory o_GlobalToLocal; +/* method switch */ +typedef enum { Auto, Pairwise, CrystalRouter, AllToAll} Method; - ogs_t(platform_t& _platform, MPI_Comm _comm): - platform(_platform), comm(_comm) {}; +/* kind enum */ +typedef enum { Unsigned, Signed, Halo} Kind; - void Free(); +} //namespace ogs - static ogs_t *Setup(dlong N, hlong *ids, MPI_Comm &comm, - int verbose, platform_t& platform); +} //namespace libp - static void Unique(hlong *ids, dlong _N, MPI_Comm _comm); +#include "ogs/ogsBase.hpp" - // Host buffer versions - void GatherScatter (void *v, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void GatherScatterVec (void *v, const int k, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void GatherScatterMany(void *v, const int k, const dlong stride, - const ogs_type type, const ogs_op op, const ogs_transpose trans); +namespace libp { - void Gather (void *gv, void *v, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void GatherVec (void *gv, void *v, const int k, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void GatherMany(void *gv, void *v, const int k, - const dlong gstride, const dlong stride, - const ogs_type type, const ogs_op op, const ogs_transpose trans); +namespace ogs { - void Scatter (void *v, void *gv, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void ScatterVec (void *v, void *gv, const int k, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void ScatterMany(void *v, void *gv, const int k, - const dlong stride, const dlong gstride, - const ogs_type type, const ogs_op op, const ogs_transpose trans); +//pre-build 
kernels +void InitializeKernels(platform_t& platform, const Type type, const Op op); +// OCCA Gather Scatter +class ogs_t : public ogsBase_t { +public: + ogs_t()=default; + ~ogs_t()=default; + + void Setup(const dlong _N, + memory ids, + comm_t _comm, + const Kind _kind, + const Method method, + const bool _unique, + const bool verbose, + platform_t& _platform); + + void SetupGlobalToLocalMapping(memory GlobalToLocal); + + // Synchronous host versions + template + void GatherScatter(memory v, + const int k, + const Op op, + const Transpose trans); + // Asynchronous host buffer versions + template + void GatherScatterStart (memory v, + const int k, + const Op op, + const Transpose trans); + template + void GatherScatterFinish(memory v, + const int k, + const Op op, + const Transpose trans); // Synchronous device buffer versions - void GatherScatter (occa::memory& o_v, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void GatherScatterVec (occa::memory& o_v, const int k, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void GatherScatterMany(occa::memory& o_v, const int k, - const dlong stride, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - - void Gather (occa::memory& o_gv, occa::memory& o_v, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void GatherVec (occa::memory& o_gv, occa::memory& o_v, const int k, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void GatherMany(occa::memory& o_gv, occa::memory& o_v, const int k, - const dlong gstride, const dlong stride, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - - void Scatter (occa::memory& o_v, occa::memory& o_gv, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void ScatterVec (occa::memory& o_v, occa::memory& o_gv, const int k, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void ScatterMany(occa::memory& o_v, occa::memory& o_gv, const int k, - 
const dlong stride, const dlong gstride, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - + template + void GatherScatter(deviceMemory o_v, + const int k, + const Op op, + const Transpose trans); // Asynchronous device buffer versions - void GatherScatterStart (occa::memory& o_v, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void GatherScatterFinish (occa::memory& o_v, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void GatherScatterVecStart (occa::memory& o_v, const int k, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void GatherScatterVecFinish (occa::memory& o_v, const int k, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void GatherScatterManyStart (occa::memory& o_v, const int k, const dlong stride, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void GatherScatterManyFinish(occa::memory& o_v, const int k, const dlong stride, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - - void GatherStart (occa::memory& o_gv, occa::memory& o_v, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void GatherFinish (occa::memory& o_gv, occa::memory& o_v, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void GatherVecStart (occa::memory& o_gv, occa::memory& o_v, const int k, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void GatherVecFinish (occa::memory& o_gv, occa::memory& o_v, const int k, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void GatherManyStart (occa::memory& o_gv, occa::memory& o_v, const int k, - const dlong gstride, const dlong stride, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void GatherManyFinish(occa::memory& o_gv, occa::memory& o_v, const int k, - const dlong gstride, const dlong stride, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - - void ScatterStart 
(occa::memory& o_v, occa::memory& o_gv, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void ScatterFinish (occa::memory& o_v, occa::memory& o_gv, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void ScatterVecStart (occa::memory& o_v, occa::memory& o_gv, const int k, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void ScatterVecFinish (occa::memory& o_v, occa::memory& o_gv, const int k, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void ScatterManyStart (occa::memory& o_v, occa::memory& o_gv, const int k, - const dlong stride, const dlong gstride, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - void ScatterManyFinish(occa::memory& o_v, occa::memory& o_gv, const int k, - const dlong stride, const dlong gstride, - const ogs_type type, const ogs_op op, const ogs_transpose trans); - - void GatheredHaloExchangeSetup(); - void GatheredHaloExchangeStart(occa::memory& o_v, - const int k, - const ogs_type type); - void GatheredHaloExchangeFinish(occa::memory& o_v, - const int k, - const ogs_type type); - - void reallocHostBuffer(size_t Nbytes); - void reallocOccaBuffer(size_t Nbytes); + template + void GatherScatterStart (deviceMemory o_v, + const int k, + const Op op, + const Transpose trans); + template + void GatherScatterFinish(deviceMemory o_v, + const int k, + const Op op, + const Transpose trans); + + // Synchronous host versions + template + void Gather(memory gv, + const memory v, + const int k, + const Op op, + const Transpose trans); + // Asynchronous host buffer versions + template + void GatherStart (memory gv, + const memory v, + const int k, + const Op op, + const Transpose trans); + template + void GatherFinish(memory gv, + const memory v, + const int k, + const Op op, + const Transpose trans); + // Synchronous device buffer versions + template + void Gather(deviceMemory o_gv, + deviceMemory o_v, + const int k, + const Op op, + const Transpose 
trans); + // Asynchronous device buffer versions + template + void GatherStart (deviceMemory o_gv, + deviceMemory o_v, + const int k, + const Op op, + const Transpose trans); + template + void GatherFinish(deviceMemory o_gv, + deviceMemory o_v, + const int k, + const Op op, + const Transpose trans); + + // Synchronous host versions + template + void Scatter(memory v, + const memory gv, + const int k, + const Transpose trans); + // Asynchronous host buffer versions + template + void ScatterStart (memory v, + const memory gv, + const int k, + const Transpose trans); + template + void ScatterFinish(memory v, + memory gv, + const int k, + const Transpose trans); + // Synchronous device buffer versions + template + void Scatter(deviceMemory o_v, + deviceMemory o_gv, + const int k, + const Transpose trans); + // Asynchronous device buffer versions + template + void ScatterStart (deviceMemory o_v, + deviceMemory o_gv, + const int k, + const Transpose trans); + template + void ScatterFinish(deviceMemory o_v, + deviceMemory o_gv, + const int k, + const Transpose trans); + + friend class halo_t; }; -// OCCA halo exchange (thin wrapper of an ogs_t object) -class halo_t { +// OCCA Halo +class halo_t : public ogsBase_t { public: - ogs_t* ogs; - - void Free() { if (ogs) { ogs->Free(); ogs=nullptr; } } - - static halo_t *Setup(dlong N, hlong *ids, MPI_Comm &comm, - int verbose, platform_t& platform) { - halo_t *halo = new halo_t(); - halo->ogs = ogs_t::Setup(N, ids, comm, verbose, platform); - return halo; - } - - // Synchronous Host buffer version - void Exchange(void *v, const int k, const ogs_type type) { - ogs->GatherScatterVec(v, k, type, ogs_add, ogs_notrans); - } - + halo_t()=default; + ~halo_t()=default; + + bool gathered_halo=false; + dlong Nhalo=0; + + void Setup(const dlong _N, + memory ids, + comm_t _comm, + const Method method, + const bool verbose, + platform_t& _platform); + + void SetupFromGather(ogs_t& ogs); + + // Synchronous Host version + template + void 
Exchange(memory v, const int k); + // Asynchronous host version + template + void ExchangeStart (memory v, const int k); + template + void ExchangeFinish(memory v, const int k); // Synchronous device buffer version - void Exchange(occa::memory &o_v, const int k, const ogs_type type) { - ogs->GatherScatterVec(o_v, k, type, ogs_add, ogs_notrans); - } - + template + void Exchange(deviceMemory o_v, const int k); // Asynchronous device buffer version - void ExchangeStart (occa::memory &o_v, const int k, const ogs_type type) { - ogs->GatherScatterVecStart(o_v, k, type, ogs_add, ogs_notrans); - } - void ExchangeFinish(occa::memory &o_v, const int k, const ogs_type type) { - ogs->GatherScatterVecFinish(o_v, k, type, ogs_add, ogs_notrans); - } - - // Synchronous Host buffer version - void Combine(void *v, const int k, const ogs_type type) { - ogs->GatherScatterVec(v, k, type, ogs_add, ogs_sym); - } - + template + void ExchangeStart (deviceMemory o_v, const int k); + template + void ExchangeFinish(deviceMemory o_v, const int k); + + // Synchronous Host version + template + void Combine(memory v, const int k); + // Asynchronous host version + template + void CombineStart (memory v, const int k); + template + void CombineFinish(memory v, const int k); // Synchronous device buffer version - void Combine(occa::memory &o_v, const int k, const ogs_type type) { - ogs->GatherScatterVec(o_v, k, type, ogs_add, ogs_sym); - } - + template + void Combine(deviceMemory o_v, const int k); // Asynchronous device buffer version - void CombineStart (occa::memory &o_v, const int k, const ogs_type type) { - ogs->GatherScatterVecStart(o_v, k, type, ogs_add, ogs_sym); - } - void CombineFinish(occa::memory &o_v, const int k, const ogs_type type) { - ogs->GatherScatterVecFinish(o_v, k, type, ogs_add, ogs_sym); - } + template + void CombineStart (deviceMemory o_v, const int k); + template + void CombineFinish(deviceMemory o_v, const int k); }; +} //namespace ogs +} //namespace libp #endif diff --git 
a/include/ogs/ogsBase.hpp b/include/ogs/ogsBase.hpp new file mode 100644 index 000000000..a0e0355ab --- /dev/null +++ b/include/ogs/ogsBase.hpp @@ -0,0 +1,112 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +#ifndef OGSBASE_HPP +#define OGSBASE_HPP + +#include "ogs.hpp" + +namespace libp { + +namespace ogs { + +//forward declarations +class ogsOperator_t; +class ogsFusedOperator_t; +class ogsExchange_t; + +struct parallelNode_t; + +class halo_t; + +class ogsBase_t { +public: + platform_t platform; + comm_t comm; + + dlong N=0; + dlong Ngather=0; // total number of local positive gather nodes + + dlong NlocalT=0; // number of local gather nodes + dlong NhaloT=0; // number of halo gather nodes + dlong NlocalP=0; // number of positive local gather nodes + dlong NhaloP=0; // number of positive halo gather nodes + + hlong NgatherGlobal=0; // global number of positive gather nodes + + Kind kind; + bool unique=false; + bool gather_defined=false; + + static stream_t dataStream; + + ogsBase_t()=default; + virtual ~ogsBase_t()=default; + + virtual void Setup(const dlong _N, + memory ids, + comm_t _comm, + const Kind _kind, + const Method method, + const bool _unique, + const bool verbose, + platform_t& _platform); + void Free(); + +protected: + std::shared_ptr gatherLocal; + std::shared_ptr gatherHalo; + std::shared_ptr exchange; + + void AssertGatherDefined(); + +private: + void FindSharedNodes(const dlong Nids, + memory &nodes, + const int verbose); + + void ConstructSharedNodes(const dlong Nids, + memory &nodes, + dlong &Nshared, + memory &sharedNodes); + + void LocalSignedSetup(const dlong Nids, memory &nodes); + void LocalUnsignedSetup(const dlong Nids, memory &nodes); + void LocalHaloSetup(const dlong Nids, memory &nodes); + + ogsExchange_t* AutoSetup(dlong Nshared, + memory &sharedNodes, + ogsOperator_t& gatherHalo, + comm_t _comm, + platform_t &_platform, + const int verbose); +}; + +} //namespace ogs + +} //namespace libp + +#endif diff --git a/include/ogs/ogsDefs.h b/include/ogs/ogsDefs.h deleted file mode 100644 index 01e63ba46..000000000 --- a/include/ogs/ogsDefs.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim 
Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- -*/ - -/* the supported types */ -typedef long long long_long; -#define OGS_FOR_EACH_TYPE(macro) \ - macro(double ) \ - macro(float ) \ - macro(int ) \ - macro(long ) \ - macro(long_long) - -/* the supported ops */ -#define OGS_FOR_EACH_OP(T,macro) \ - macro(T,add) \ - macro(T,mul) \ - macro(T,min) \ - macro(T,max) - -#define OGS_DO_add(a,b) a+=b -#define OGS_DO_mul(a,b) a*=b -#define OGS_DO_min(a,b) if(ba) a=b - -/* type size array */ -#define OGS_TYPE_SIZE_ITEM(T) sizeof(T), -#define OGS_DEFINE_TYPE_SIZES() \ - static const unsigned ogs_type_size[] = \ - { OGS_FOR_EACH_TYPE(OGS_TYPE_SIZE_ITEM) 0 }; - -/* mapping from ogs types to gs types */ -#define gs_int64_t gs_long_long -#define OGS_GS_MAP_TYPE_ITEM(T) gs_##T, -#define OGS_GS_DEFINE_TYPE_MAP() \ - static const gs_dom ogs_gs_type_map[] = \ - { OGS_FOR_EACH_TYPE(OGS_GS_MAP_TYPE_ITEM) gs_dom_n }; - -/* mapping from ogs ops to gs ops */ -#define OGS_GS_MAP_OP_ITEM(T,OP) gs_##OP, -#define OGS_GS_DEFINE_OP_MAP() \ - static const gs_op ogs_gs_op_map[] = \ - { OGS_FOR_EACH_OP(T,OGS_GS_MAP_OP_ITEM) gs_op_n }; diff --git a/include/ogs/ogsExchange.hpp b/include/ogs/ogsExchange.hpp new file mode 100644 index 000000000..40f6f6f38 --- /dev/null +++ b/include/ogs/ogsExchange.hpp @@ -0,0 +1,336 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#ifndef OGS_EXCHANGE_HPP +#define OGS_EXCHANGE_HPP + +#include "ogs.hpp" +#include "ogs/ogsOperator.hpp" + +namespace libp { + +namespace ogs { + +//virtual base class to perform MPI exchange of gatherScatter +class ogsExchange_t { +public: + platform_t platform; + comm_t comm; + int rank, size; + + dlong Nhalo, NhaloP; + + pinnedMemory h_workspace, h_sendspace; + deviceMemory o_workspace, o_sendspace; + + stream_t dataStream; + static kernel_t extractKernel[4]; + +#ifdef GPU_AWARE_MPI + bool gpu_aware=true; +#else + bool gpu_aware=false; +#endif + + ogsExchange_t(platform_t &_platform, comm_t _comm, + stream_t _datastream): + platform(_platform), + comm(_comm), + dataStream(_datastream) { + rank = comm.rank(); + size = comm.size(); + } + virtual ~ogsExchange_t() {} + + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans)=0; + + virtual void 
Start(deviceMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans)=0; + + virtual void AllocBuffer(size_t Nbytes)=0; + + friend void InitializeKernels(platform_t& platform, const Type type, const Op op); +}; + +//MPI communcation via single MPI_Alltoallv call +class ogsAllToAll_t: public ogsExchange_t { +private: + + dlong NsendN=0, NsendT=0; + memory sendIdsN, sendIdsT; + deviceMemory o_sendIdsN, o_sendIdsT; + + ogsOperator_t postmpi; + + memory mpiSendCountsN; + memory mpiSendCountsT; + memory mpiRecvCountsN; + memory mpiRecvCountsT; + memory mpiSendOffsetsN; + memory mpiSendOffsetsT; + memory mpiRecvOffsetsN; + memory mpiRecvOffsetsT; + + memory sendCounts; + memory recvCounts; + memory sendOffsets; + memory recvOffsets; + + Comm::request_t request; + +public: + ogsAllToAll_t(dlong Nshared, + memory &sharedNodes, + ogsOperator_t &gatherHalo, + stream_t _dataStream, + comm_t _comm, + platform_t &_platform); + + template + void Start(pinnedMemory &buf, + const int k, + const Op op, + const Transpose trans); + + template + void Finish(pinnedMemory &buf, + const int k, + const Op op, + const Transpose trans); + + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(pinnedMemory &buf,const int k,const Op 
op,const Transpose trans); + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + + template + void Start(deviceMemory &buf, + const int k, + const Op op, + const Transpose trans); + + template + void Finish(deviceMemory &buf, + const int k, + const Op op, + const Transpose trans); + + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); + + virtual void AllocBuffer(size_t Nbytes); + +}; + +//MPI communcation via pairwise send/recvs +class ogsPairwise_t: public ogsExchange_t { +private: + + dlong NsendN=0, NsendT=0; + memory sendIdsN, sendIdsT; + deviceMemory o_sendIdsN, o_sendIdsT; + + ogsOperator_t postmpi; + + int NranksSendN=0, NranksRecvN=0; + int NranksSendT=0, NranksRecvT=0; + memory sendRanksN; + memory sendRanksT; + memory recvRanksN; + memory recvRanksT; + memory sendCountsN; + memory sendCountsT; + memory recvCountsN; + memory recvCountsT; + memory sendOffsetsN; + memory sendOffsetsT; + memory recvOffsetsN; + memory recvOffsetsT; + memory 
requests; + +public: + ogsPairwise_t(dlong Nshared, + memory &sharedNodes, + ogsOperator_t &gatherHalo, + stream_t _dataStream, + comm_t _comm, + platform_t &_platform); + + template + void Start(pinnedMemory &buf, + const int k, + const Op op, + const Transpose trans); + + template + void Finish(pinnedMemory &buf, + const int k, + const Op op, + const Transpose trans); + + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + + template + void Start(deviceMemory &buf, + const int k, + const Op op, + const Transpose trans); + + template + void Finish(deviceMemory &buf, + const int k, + const Op op, + const Transpose trans); + + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); + + virtual void 
AllocBuffer(size_t Nbytes); +}; + +//MPI communcation via Crystal Router +class ogsCrystalRouter_t: public ogsExchange_t { +private: + + struct crLevel { + int Nmsg; + int partner; + + int Nsend, Nrecv0, Nrecv1; + dlong recvOffset; + + memory sendIds; + deviceMemory o_sendIds; + + ogsOperator_t gather; + }; + + int buf_id=0, hbuf_id=0; + pinnedMemory h_work[2]; + deviceMemory o_work[2]; + + memory request; + + int Nlevels=0; + memory levelsN; + memory levelsT; + + int NsendMax=0, NrecvMax=0; + +public: + ogsCrystalRouter_t(dlong Nshared, + memory &sharedNodes, + ogsOperator_t &gatherHalo, + stream_t _dataStream, + comm_t _comm, + platform_t &_platform); + + template + void Start(pinnedMemory &buf, + const int k, + const Op op, + const Transpose trans); + + template + void Finish(pinnedMemory &buf, + const int k, + const Op op, + const Transpose trans); + + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + + template + void Start(deviceMemory &buf, + const int k, + const Op op, + const Transpose trans); + + template + void Finish(deviceMemory &buf, + const int k, + const Op op, + const Transpose trans); + + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(deviceMemory &buf,const int 
k,const Op op,const Transpose trans); + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); + + virtual void AllocBuffer(size_t Nbytes); +}; + +} //namespace ogs + +} //namespace libp + +#endif diff --git a/include/ogs/ogsKernels.hpp b/include/ogs/ogsKernels.hpp deleted file mode 100644 index cfab2f207..000000000 --- a/include/ogs/ogsKernels.hpp +++ /dev/null @@ -1,203 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- -*/ - -#ifndef OGS_KERNELS_HPP -#define OGS_KERNELS_HPP - -#include -#include "ogs.hpp" -#include "ogsDefs.h" - -#define DEFINE_ADD_OGS_INIT(T) \ - static T init_##T##_add = (T) 0; \ - static T init_##T##_mul = (T) 1; \ - static T init_##T##_min = (T) std::numeric_limits::max(); \ - static T init_##T##_max = (T) -std::numeric_limits::max(); - -class ogsData_t; - -namespace ogs { - -extern const int blockSize; -extern const int gatherNodesPerBlock; - -extern int Nrefs; - -extern occa::stream dataStream; - -void initKernels(platform_t& platform); - -void freeKernels(); - -//Setup a gslib struct -void *gsSetup(MPI_Comm meshComm, - dlong NuniqueBases, - hlong *gatherGlobalNodes, - int nonsymm, int verbose); - -void gsUnique(hlong *gatherGlobalNodes, - dlong NuniqueBases, - MPI_Comm meshComm); - -void gsFree(void* gs); - -#define DEFINE_GATHERSCATTER_KERNEL(T,OP) \ - extern occa::kernel gatherScatterKernel_##T##_##OP; - -#define DEFINE_GATHER_KERNEL(T,OP) \ - extern occa::kernel gatherKernel_##T##_##OP; - -#define DEFINE_SCATTER_KERNEL(T) \ - extern occa::kernel scatterKernel_##T; - -#define DEFINE_KERNELS(T) \ - OGS_FOR_EACH_OP(T,DEFINE_GATHERSCATTER_KERNEL) \ - OGS_FOR_EACH_OP(T,DEFINE_GATHER_KERNEL) \ - DEFINE_SCATTER_KERNEL(T) - -OGS_FOR_EACH_TYPE(DEFINE_KERNELS) - -#undef DEFINE_GATHERSCATTER_KERNEL -#undef DEFINE_GATHER_KERNEL -#undef DEFINE_SCATTER_KERNEL -#undef DEFINE_KERNELS - -void occaGatherScatterStart(occa::memory& o_v, - const int Nentries, const int Nvectors, const dlong stride, - const ogs_type type, const ogs_op op, - const ogs_transpose trans, ogs_t &ogs); -void occaGatherScatterFinish(occa::memory& o_v, - const int Nentries, const int Nvectors, const dlong stride, - const ogs_type type, const ogs_op op, - const ogs_transpose trans, ogs_t &ogs); - -void occaGatherStart(occa::memory& o_gv, occa::memory& o_v, - const int Nentries, const int Nvectors, - const dlong gstride, const dlong stride, - const ogs_type type, const ogs_op op, - const 
ogs_transpose trans, ogs_t &ogs); -void occaGatherFinish(occa::memory& o_gv, occa::memory& o_v, - const int Nentries, const int Nvectors, - const dlong gstride, const dlong stride, - const ogs_type type, const ogs_op op, - const ogs_transpose trans, ogs_t &ogs); - -void occaScatterStart(occa::memory& o_v, occa::memory& o_gv, - const int Nentries, const int Nvectors, - const dlong stride, const dlong gstride, - const ogs_type type, const ogs_op op, - const ogs_transpose trans, ogs_t &ogs); -void occaScatterFinish(occa::memory& o_v, occa::memory& o_gv, - const int Nentries, const int Nvectors, - const dlong stride, const dlong gstride, - const ogs_type type, const ogs_op op, - const ogs_transpose trans, ogs_t &ogs); - -void hostGatherScatter(void* v, const int Nentries, const int Nvectors, - const dlong stride, const ogs_type type, - const ogs_op op, const ogs_transpose trans, ogs_t &ogs); - -void hostGather(void* gv, void* v, const int Nentries, const int Nvectors, - const dlong gstride, const dlong stride, - const ogs_type type, const ogs_op op, - const ogs_transpose trans, ogs_t &ogs); - -void hostScatter(void* v, void* gv, const int Nentries, const int Nvectors, - const dlong stride, const dlong gstride, - const ogs_type type, const ogs_op op, - const ogs_transpose trans, ogs_t &ogs); - -void occaGatherScatterKernel(const ogsData_t &gather, - const ogsData_t &scatter, - const int Nentries, - const int Nvectors, - const dlong stride, - const ogs_type type, - const ogs_op op, - occa::memory& o_v); - -void occaGatherKernel(const ogsData_t &gather, - const int Nentries, - const int Nvectors, - const dlong stride, - const dlong gtride, - const ogs_type type, - const ogs_op op, - occa::memory& o_v, - occa::memory& o_gv); - -void occaScatterKernel(const ogsData_t &gather, - const int Nentries, - const int Nvectors, - const dlong gtride, - const dlong stride, - const ogs_type type, - const ogs_op op, - occa::memory& o_gv, - occa::memory& o_v); - -void 
hostGatherScatterKernel(const dlong N, - const int Nentries, - const int Nvectors, - const dlong stride, - dlong* gatherStarts, - dlong* gatherIds, - dlong* scatterStarts, - dlong* scatterIds, - const ogs_type type, - const ogs_op op, - void* v); - -void hostGatherKernel(const dlong N, - const int Nentries, - const int Nvectors, - const dlong stride, - const dlong gstride, - const dlong *gatherStarts, - const dlong *gatherIds, - const ogs_type type, - const ogs_op op, - const void *v, - void *gv); - -void hostScatterKernel(const dlong N, - const int Nentries, - const int Nvectors, - const dlong gstride, - const dlong stride, - const dlong *scatterStarts, - const dlong *scatterIds, - const ogs_type type, - const ogs_op op, - const void *gv, - void *v); - -void gsGatherScatter(void* v, const int Nentries, const int Nvectors, - const dlong stride, const ogs_type type, const ogs_op op, - const ogs_transpose trans, void * gsh); -} - -#endif diff --git a/include/ogs/ogsOperator.hpp b/include/ogs/ogsOperator.hpp new file mode 100644 index 000000000..dffafc79c --- /dev/null +++ b/include/ogs/ogsOperator.hpp @@ -0,0 +1,147 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#ifndef OGS_OPERATOR_HPP +#define OGS_OPERATOR_HPP + +#include "ogs.hpp" + +namespace libp { + +namespace ogs { + +// The Z operator class is essentially a sparse CSR matrix, +// with no vals stored. By construction, the sparse +// matrix will have at most 1 non-zero per column. +class ogsOperator_t { +public: + platform_t platform; + + dlong Ncols=0; + dlong NrowsN=0; + dlong NrowsT=0; + dlong nnzN=0; + dlong nnzT=0; + + memory rowStartsN; + memory rowStartsT; + memory colIdsN; + memory colIdsT; + + deviceMemory o_rowStartsN; + deviceMemory o_rowStartsT; + deviceMemory o_colIdsN; + deviceMemory o_colIdsT; + + dlong NrowBlocksN=0; + dlong NrowBlocksT=0; + memory blockRowStartsN; + memory blockRowStartsT; + deviceMemory o_blockRowStartsN; + deviceMemory o_blockRowStartsT; + + Kind kind; + + ogsOperator_t()=default; + ogsOperator_t(platform_t& _platform) + : platform(_platform) {}; + + void Free(); + + void setupRowBlocks(); + + //Apply Z operator + template class U, + template class V, + typename T> + void Gather(U gv, const V v, + const int k, const Op op, const Transpose trans); + + template + void Gather(deviceMemory gv, const deviceMemory v, + const int k, const Op op, const Transpose trans); + + //Apply Z^T transpose operator + template class U, + template class V, + typename T> + void Scatter(U v, const V gv, + const int k, const Transpose trans); + + template + void Scatter(deviceMemory v, const deviceMemory gv, + const int k, const Transpose trans); + + //Apply Z^T*Z operator + template 
class U, + typename T> + void GatherScatter(U v, const int k, + const Op op, const Transpose trans); + + template + void GatherScatter(deviceMemory v, const int k, + const Op op, const Transpose trans); + +private: + template class U, + template class V, + template class Op, + typename T> + void Gather(U gv, const V v, + const int K, const Transpose trans); + template class U, + template class Op, + typename T> + void GatherScatter(U v, const int K, + const Transpose trans); + + //NC: Hard code these for now. Should be sufficient for GPU devices, but needs attention for CPU + static constexpr int blockSize = 256; + static constexpr int gatherNodesPerBlock = 512; //should be a multiple of blockSize for good unrolling + + //4 types - Float, Double, Int32, Int64 + //4 ops - Add, Mul, Max, Min + static kernel_t gatherScatterKernel[4][4]; + static kernel_t gatherKernel[4][4]; + static kernel_t scatterKernel[4]; + + friend void InitializeKernels(platform_t& platform, const Type type, const Op op); +}; + +template class U, + template class V, + typename T> +void extract(const dlong N, + const int K, + const memory ids, + const U q, + V gatherq); + +} //namespace ogs + +} //namespace libp + +#endif diff --git a/include/ogs/ogsUtils.hpp b/include/ogs/ogsUtils.hpp new file mode 100644 index 000000000..1e11023ca --- /dev/null +++ b/include/ogs/ogsUtils.hpp @@ -0,0 +1,87 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall 
be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#ifndef OGS_UTILS_HPP +#define OGS_UTILS_HPP + +#include "ogs.hpp" + +namespace libp { + +namespace ogs { + +struct parallelNode_t{ + + dlong localId; // local node id + hlong baseId; // original global index + + dlong newId; // new global id + int sign; + + int rank; //original rank + int destRank; //destination rank + +}; + +template +struct ogsType { + static constexpr Type get(); +}; + +template<> struct ogsType { + static constexpr Type get() { return Float; } +}; +template<> struct ogsType { + static constexpr Type get() { return Double; } +}; +template<> struct ogsType { + static constexpr Type get() { return Int32; } +}; +template<> struct ogsType { + static constexpr Type get() { return Int64; } +}; + +//permute an array A, according to the ordering returned by P +// i.e. 
for all n, A[P(n)] <- A[n] +template +void permute(const dlong N, memory A, Order P) { + + for(dlong n=0;n &o_r, deviceMemory &o_Mr) { + LIBP_FORCE_ABORT("Operator not implemented in this object"); + }; +}; + +} //namespace libp + +#endif diff --git a/libs/mesh/meshPartitionStatistics.cpp b/include/parAdogs.hpp similarity index 52% rename from libs/mesh/meshPartitionStatistics.cpp rename to include/parAdogs.hpp index 8d236ef6b..b4868eda6 100644 --- a/libs/mesh/meshPartitionStatistics.cpp +++ b/include/parAdogs.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -24,41 +24,39 @@ SOFTWARE. */ -#include "mesh.hpp" - -void mesh_t::PrintPartitionStatistics(){ - - /* now gather statistics on connectivity between processes */ - int *comms = (int*) calloc(size, sizeof(int)); - int Ncomms = 0; - - /* count elements with neighbors on each other rank ranks */ - for(dlong e=0;e0) - ++Nmessages; - - for(int rr=0;rr& faceVertices, + memory& EToV, + memory& EToE, + memory& EToF, + memory& EX, + memory& EY, + memory& EZ, + comm_t comm); + +} //namespace paradogs + +} //namespace libp + +#endif + diff --git a/include/parAdogs/parAdogsGraph.hpp b/include/parAdogs/parAdogsGraph.hpp new file mode 100644 index 000000000..45d73cd15 --- /dev/null +++ b/include/parAdogs/parAdogsGraph.hpp @@ -0,0 +1,159 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, 
distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#ifndef PARADOGS_GRAPH_HPP +#define PARADOGS_GRAPH_HPP 1 + +#include "parAdogs.hpp" +#include "parAdogs/parAdogsMatrix.hpp" +#include "parAdogs/parAdogsMultigrid.hpp" + +namespace libp { + +namespace paradogs { + +class graph_t { +public: + /*Mesh data*/ + static constexpr int MAX_NVERTS=8; + static constexpr int MAX_NFACES=6; + static constexpr int MAX_NFACEVERTS=4; + +private: + platform_t platform; + + comm_t gcomm; + comm_t comm; + + int rank, size; + dlong Nverts=0, Nhalo=0; + hlong NVertsGlobal=0; + hlong VoffsetL=0, VoffsetU=0; + + int grank, gsize; + hlong gNVertsGlobal=0; + hlong gVoffsetL=0, gVoffsetU=0; + + + dlong Nelements=0; + int dim=0; + int Nfaces=0; + int NelementVerts=0; + int NfaceVerts=0; + struct element_t { + dfloat EX[MAX_NVERTS]; //x coordinates of verts + dfloat EY[MAX_NVERTS]; //y coordinates of verts + dfloat EZ[MAX_NVERTS]; //z coordinates of verts + hlong V[MAX_NVERTS]; //Global Vertex Ids of verts + + hlong E[MAX_NFACES]; //Global element ids of neighbors + int F[MAX_NFACES]; //Face ids of neighbors + }; + memory elements; + + int faceVerts[MAX_NFACES*MAX_NFACEVERTS]; + + /*Multilevel Laplacian (for spectral partitioning)*/ + static constexpr int MAX_LEVELS=100; + int 
Nlevels=0; + mgLevel_t L[MAX_LEVELS]; + coarseSolver_t coarseSolver; + + memory colIds; + +public: + /*Build a graph from mesh connectivity info*/ + graph_t(platform_t &_platform, + const dlong _Nelements, + const int _dim, + const int _Nverts, + const int _Nfaces, + const int _NfaceVerts, + const memory& faceVertices, + const memory& EToV, + const memory& EX, + const memory& EY, + const memory& EZ, + comm_t _comm); + + void InertialPartition(); + + void SpectralPartition(); + + void Connect(); + + void CuthillMckee(); + + void Report(); + + void ExtractMesh(dlong &Nelements_, + memory& EToV, + memory& EToE, + memory& EToF, + memory& EX, + memory& EY, + memory& EZ); + +private: + void InertialBipartition(const dfloat targetFraction[2]); + void SpectralBipartition(const dfloat targetFraction[2]); + + + /*Divide graph into two pieces according to a bisection*/ + void Split(const memory& partition); + + void CreateLaplacian(); + + /*Compute Fiedler vector of graph */ + memory& FiedlerVector(); + + /*Improve a Fiedler vector*/ + void Refine(const int level); + + /* Solve A_{l}*x = b*/ + int Solve(const int level, + const dfloat TOL, + memory& r, + memory& x, + memory& scratch); + + /*Create multilevel heirarchy*/ + void MultigridSetup(); + + void MultigridVcycle(const int l, + memory& r, + memory& x); + + /*Clear multilevel heirarchy*/ + void MultigridDestroy(); +}; + +} //namespace paradogs + +} //namespace libp + +#endif + diff --git a/include/parAdogs/parAdogsMatrix.hpp b/include/parAdogs/parAdogsMatrix.hpp new file mode 100644 index 000000000..cc9062f0f --- /dev/null +++ b/include/parAdogs/parAdogsMatrix.hpp @@ -0,0 +1,128 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the 
rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#ifndef PARADOGS_MATRIX_HPP +#define PARADOGS_MATRIX_HPP 1 + +#include "parAdogs.hpp" +#include "ogs.hpp" + +namespace libp { + +namespace paradogs { + +struct nonZero_t { + hlong row; + hlong col; + dfloat val; +}; + + +class parCSR { +public: + platform_t platform; + comm_t comm; + + dlong Nrows=0; + dlong Ncols=0; + + //partition info + hlong rowOffsetL=0, rowOffsetU=0; + hlong colOffsetL=0, colOffsetU=0; + + //local sparse matrix + struct CSR { + dlong nnz=0; + memory rowStarts; + memory cols; + memory vals; + }; + CSR diag; + + //non-local sparse matrix + struct MCSR { + dlong nnz=0; + dlong nzRows=0; + + memory rowStarts; + memory mRowStarts; + memory rows; + memory cols; + memory vals; + }; + MCSR offd; + + memory diagA; + memory diagInv; + + /*communcation info*/ + dlong NlocalCols = 0; + ogs::halo_t halo; + memory colMap; + + //rho ~= cond(invD * A) + dfloat rho=0.0; + + parCSR()=default; + parCSR(dlong N, dlong M, platform_t& _platform, comm_t _comm): + platform(_platform), comm(_comm), Nrows(N), Ncols(M) {} + + //build a parCSR matrix from a distributed COO matrix + parCSR(dlong _Nrows, dlong _Ncols, + const dlong NNZ, + memory& entries, + const 
platform_t &_platform, + comm_t comm); + + void haloSetup(memory& colIds); + + // estimate rho(invD * A) + dfloat rhoDinvA(memory& null); + + /*Aggregate via distance-2 PMIS*/ + void Aggregate(dlong& cNverts, + const dfloat theta, + memory& FineToCoarse); + + void GalerkinProduct(const parCSR &A, const parCSR &P); + + void SpMV(const dfloat alpha, memory& x, + const dfloat beta, memory& y); + void SpMV(const dfloat alpha, memory& x, + const dfloat beta, const memory& y, memory& z); + + void SmoothChebyshev(memory& b, memory& x, + const dfloat lambda0, const dfloat lambda1, + const bool xIsZero, memory& scratch, + const int ChebyshevIterations); +}; + +} //namespace paradogs + +} //namespace libp + +#endif + diff --git a/include/parAdogs/parAdogsMultigrid.hpp b/include/parAdogs/parAdogsMultigrid.hpp new file mode 100644 index 000000000..086498f13 --- /dev/null +++ b/include/parAdogs/parAdogsMultigrid.hpp @@ -0,0 +1,121 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#ifndef PARADOGS_MULTIGRID_HPP +#define PARADOGS_MULTIGRID_HPP 1 + +#include "parAdogs.hpp" +#include "parAdogs/parAdogsMatrix.hpp" + +namespace libp { + +namespace paradogs { + +class mgLevel_t { +public: + dlong Nrows=0, Ncols=0; + hlong Nglobal=0; + + parCSR A, P, R; + + /*null vector*/ + memory null; + + /*Fiedler vector*/ + memory Fiedler; + + /*Vcycle storage*/ + memory RHS; + memory X; + memory RES; + memory scratch; + + dfloat lambda1, lambda0; //smoothing params + + /*Create graph Laplacian*/ + void CreateLaplacian(const dlong Nelements, + const int Nfaces, + const memory& EToE, + comm_t comm); + + /*Construct a coarse level*/ + void CoarsenLevel(mgLevel_t &Lf, const dfloat theta); + + void SetupSmoother(); + + void AllocateScratch(const int l); + + /*Compute Fiedler vector directly*/ + void FiedlerVector(); + + /*Multigrid functions*/ + void Smooth(memory& r, memory& x, const bool xIsZero); + void Residual(memory& r, memory& x, memory& res); + void Coarsen(memory& x, memory& xC); + void Prolongate(memory& xC, memory& x); +}; + +parCSR TentativeProlongator(const dlong Nf, + const dlong Nc, + platform_t& platform, + comm_t comm, + memory& FineToCoarse, + memory& FineNull, + memory& CoarseNull); + +parCSR SmoothProlongator(const parCSR& A, + const parCSR& T); + +parCSR Transpose(const parCSR& A); + +parCSR SpMM(const parCSR& A, const parCSR& B); + +class coarseSolver_t { + +public: + comm_t comm; + + int N=0; + int Nrows=0; + int Ncols=0; + + int coarseTotal=0; + memory coarseCounts; + memory coarseOffsets; + + memory invA; + memory grhs; + + void Setup(parCSR& A, memory& null); + void Solve(memory& r, memory& x); +}; + +} //namespace paradogs + +} //namespace libp + +#endif + diff --git 
a/include/mesh/meshDefines2D.h b/include/parAdogs/parAdogsPartition.hpp similarity index 67% rename from include/mesh/meshDefines2D.h rename to include/parAdogs/parAdogsPartition.hpp index 1d656753b..af102c878 100644 --- a/include/mesh/meshDefines2D.h +++ b/include/parAdogs/parAdogsPartition.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -24,32 +24,22 @@ SOFTWARE. */ -#ifndef MESH_DEFINES2D_H -#define MESH_DEFINES2D_H 1 - -/* offsets for geometric factors */ -#define RXID 0 -#define RYID 1 -#define SXID 2 -#define SYID 3 -#define JID 4 -#define JWID 5 -#define IJWID 6 - -/* offsets for second order geometric factors */ -#define G00ID 0 -#define G01ID 1 -#define G11ID 2 -#define GWJID 3 - -/* offsets for nx, ny, sJ, 1/J */ -#define NXID 0 -#define NYID 1 -#define SJID 2 -#define IJID 3 -#define IHID 4 -#define WSJID 5 -#define WIJID 6 +#ifndef PARADOGS_PARTITION_HPP +#define PARADOGS_PARTITION_HPP 1 + +#include "parAdogs.hpp" +#include "parAdogs/parAdogsGraph.hpp" + +namespace libp { + +namespace paradogs { + +dfloat ParallelPivot(const dlong N, memory& F, + const hlong k, comm_t comm); + +} //namespace paradogs + +} //namespace libp #endif diff --git a/include/parAlmond.hpp b/include/parAlmond.hpp index 45ee3a1d7..7d9f2d5a0 100644 --- a/include/parAlmond.hpp +++ b/include/parAlmond.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -35,22 +35,22 @@ SOFTWARE. 
#include "precon.hpp" #include "linearSolver.hpp" +namespace libp { + namespace parAlmond { -void AddSettings(settings_t& settings, const string prefix=""); +void AddSettings(settings_t& settings, const std::string prefix=""); void ReportSettings(settings_t& settings); -extern MPI_Datatype MPI_NONZERO_T; - //distributed matrix class passed to AMG setup class parCOO { public: - platform_t &platform; - MPI_Comm comm; + platform_t platform; + comm_t comm; dlong nnz=0; - hlong *globalRowStarts=nullptr; - hlong *globalColStarts=nullptr; + memory globalRowStarts; + memory globalColStarts; //non-zero matrix entries struct nonZero_t { @@ -58,70 +58,183 @@ class parCOO { hlong col; dfloat val; }; - nonZero_t *entries=nullptr; + memory entries; - parCOO(platform_t &_platform, MPI_Comm _comm): + parCOO() = default; + parCOO(platform_t &_platform, comm_t _comm): platform(_platform), comm(_comm) {}; - - ~parCOO() { - if(entries) free(entries); - if(globalRowStarts) free(globalRowStarts); - if(globalColStarts) free(globalColStarts); - } }; //abstract multigrid level // Class is derived from solver, and must have Operator defined -class multigridLevel: public solver_t { +class multigridLevel: public operator_t { public: + platform_t platform; + settings_t settings; + comm_t comm; + dlong Nrows=0, Ncols=0; - occa::memory o_scratch; + deviceMemory o_scratch; + multigridLevel() = default; multigridLevel(dlong N, dlong M, platform_t& _platform, - settings_t& _settings): - solver_t(_platform, _settings), Nrows(N), Ncols(M) {} - virtual ~multigridLevel() {}; - - virtual void smooth(occa::memory& o_rhs, occa::memory& o_x, bool x_is_zero)=0; - virtual void residual(occa::memory& o_rhs, occa::memory& o_x, occa::memory& o_res)=0; - virtual void coarsen(occa::memory& o_x, occa::memory& o_Cx)=0; - virtual void prolongate(occa::memory& o_x, occa::memory& o_Px)=0; + settings_t& _settings, comm_t _comm): + platform(_platform), settings(_settings), + comm(_comm), Nrows(N), Ncols(M) {} + + 
virtual void smooth(deviceMemory& o_rhs, deviceMemory& o_x, bool x_is_zero)=0; + virtual void residual(deviceMemory& o_rhs, deviceMemory& o_x, deviceMemory& o_res)=0; + virtual void coarsen(deviceMemory& o_x, deviceMemory& o_Cx)=0; + virtual void prolongate(deviceMemory& o_x, deviceMemory& o_Px)=0; virtual void Report()=0; }; -//forward declaration +typedef enum {VCYCLE=0,KCYCLE=1,EXACT=3} CycleType; +typedef enum {SMOOTHED=0,UNSMOOTHED=1} AggType; +typedef enum {PCG=0,GMRES=1} KrylovType; +typedef enum {DAMPED_JACOBI=0,CHEBYSHEV=1} SmoothType; +typedef enum {RUGESTUBEN=0,SYMMETRIC=1} StrengthType; +typedef enum {COARSEEXACT=0,COARSEOAS=1} CoarseType; + +class coarseSolver_t; + //multigrid preconditioner -class multigrid_t; +class multigrid_t: public operator_t { +public: + platform_t platform; + settings_t settings; + comm_t comm; + + bool exact=false; + linearSolver_t linearSolver; + + CycleType ctype; + AggType aggtype; + StrengthType strtype; + CoarseType coarsetype; + + int numLevels=0; + int baseLevel=0; + static constexpr int PARALMOND_MAX_LEVELS=100; + std::shared_ptr levels[PARALMOND_MAX_LEVELS]; + + deviceMemory o_rhs[PARALMOND_MAX_LEVELS]; + deviceMemory o_x[PARALMOND_MAX_LEVELS]; + + std::shared_ptr coarseSolver; + + //scratch space for smoothing and temporary residual vector + size_t NscratchSpace=0; + deviceMemory o_scratch; + + KrylovType ktype; + + deviceMemory o_ck[PARALMOND_MAX_LEVELS]; + deviceMemory o_vk[PARALMOND_MAX_LEVELS]; + deviceMemory o_wk[PARALMOND_MAX_LEVELS]; + + //scratch space + size_t NreductionScratch=0; + pinnedMemory reductionScratch; + deviceMemory o_reductionScratch; -class parAlmond_t: public precon_t { + multigrid_t() = default; + multigrid_t(platform_t& _platform, settings_t& _settings, + comm_t _comm); + + template + Level& AddLevel(Args&& ... 
args) { + levels[numLevels++] = std::make_shared(args...); + AllocateLevelWorkSpace(numLevels-1); + return dynamic_cast(*levels[numLevels-1]); + } + template + Level& GetLevel(const int l) { + return dynamic_cast(*levels[l]); + } + + void AllocateLevelWorkSpace(const int k); + + void Operator(deviceMemory& o_RHS, deviceMemory& o_X); + + void vcycle(const int k, deviceMemory& o_RHS, deviceMemory& o_X); + void kcycle(const int k, deviceMemory& o_RHS, deviceMemory& o_X); + +private: + void kcycleOp1(multigridLevel& level, + deviceMemory& o_X, deviceMemory& o_RHS, + deviceMemory& o_CK, deviceMemory& o_VK, + dfloat& alpha1, dfloat& rho1, + dfloat& norm_rhs, dfloat& norm_rhstilde); + + void kcycleOp2(multigridLevel& level, + deviceMemory& o_X, deviceMemory& o_RHS, + deviceMemory& o_CK, deviceMemory& o_VK, deviceMemory& o_WK, + const dfloat alpha1, const dfloat rho1); + + void kcycleCombinedOp1(multigridLevel& level, + deviceMemory& o_a, + deviceMemory& o_b, + deviceMemory& o_c, + dfloat& aDotb, + dfloat& aDotc, + dfloat& bDotb); + void kcycleCombinedOp2(multigridLevel& level, + deviceMemory& o_a, + deviceMemory& o_b, + deviceMemory& o_c, + deviceMemory& o_d, + dfloat& aDotb, + dfloat& aDotc, + dfloat& aDotd); + dfloat vectorAddInnerProd(multigridLevel& level, + const dfloat alpha, deviceMemory& o_x, + const dfloat beta, deviceMemory& o_y); +}; + +class parAlmond_t: public operator_t { public: - parAlmond_t(platform_t& _platform, settings_t& settings_, MPI_Comm comm); - ~parAlmond_t(); + parAlmond_t() = default; + parAlmond_t(platform_t& _platform, settings_t& _settings, comm_t _comm) { + Setup(_platform, _settings, _comm); + } + + void Setup(platform_t& _platform, settings_t& _settings, comm_t _comm); - //Add level to multigrid heirarchy - void AddLevel(multigridLevel* level); + template + Level& AddLevel(Args&& ... 
args) { + return multigrid->AddLevel(args...); + } + template + Level& GetLevel(const int l) { + return multigrid->GetLevel(l); + } + + int NumLevels(); // Setup AMG //-- Local A matrix data must be globally indexed & row sorted void AMGSetup(parCOO& A, bool nullSpace, - dfloat *nullVector, + memory nullVector, dfloat nullSpacePenalty); - void Operator(occa::memory& o_rhs, occa::memory& o_x); + void Operator(deviceMemory& o_rhs, deviceMemory& o_x); void Report(); dlong getNumCols(int k); dlong getNumRows(int k); private: - platform_t& platform; - settings_t& settings; + platform_t platform; + settings_t settings; - multigrid_t *multigrid=nullptr; + std::shared_ptr multigrid=nullptr; }; } //namespace parAlmond +} //namespace libp + #endif diff --git a/include/parAlmond/parAlmondAMGLevel.hpp b/include/parAlmond/parAlmondAMGLevel.hpp index 4f1f556a5..9cb5a5113 100644 --- a/include/parAlmond/parAlmondAMGLevel.hpp +++ b/include/parAlmond/parAlmondAMGLevel.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -30,30 +30,31 @@ SOFTWARE. 
#include "parAlmond.hpp" #include "parAlmond/parAlmondparCSR.hpp" +namespace libp { namespace parAlmond { class amgLevel: public multigridLevel { public: - parCSR *A=nullptr, *P=nullptr, *R=nullptr; + parCSR A, P, R; SmoothType stype; dfloat lambda, lambda1, lambda0; //smoothing params int ChebyshevIterations=2; - amgLevel(parCSR *AA, settings_t& _settings); - ~amgLevel(); + amgLevel() = default; + amgLevel(parCSR& AA, settings_t& _settings); - void Operator(occa::memory& o_x, occa::memory& o_Ax); - void residual(occa::memory& o_rhs, occa::memory& o_x, occa::memory& o_res); - void coarsen(occa::memory& o_x, occa::memory& o_Cx); - void prolongate(occa::memory& o_x, occa::memory& o_Px); + void Operator(deviceMemory& o_x, deviceMemory& o_Ax); + void residual(deviceMemory& o_rhs, deviceMemory& o_x, deviceMemory& o_res); + void coarsen(deviceMemory& o_x, deviceMemory& o_Cx); + void prolongate(deviceMemory& o_x, deviceMemory& o_Px); - void smooth(occa::memory& o_rhs, occa::memory& o_x, bool x_is_zero); - void smoothDampedJacobi(occa::memory& o_r, occa::memory& o_x, bool x_is_zero); - void smoothChebyshev(occa::memory& o_r, occa::memory& o_x, bool x_is_zero); + void smooth(deviceMemory& o_rhs, deviceMemory& o_x, bool x_is_zero); + void smoothDampedJacobi(deviceMemory& o_r, deviceMemory& o_x, bool x_is_zero); + void smoothChebyshev(deviceMemory& o_r, deviceMemory& o_x, bool x_is_zero); void Report(); @@ -62,6 +63,8 @@ class amgLevel: public multigridLevel { void syncToDevice(); }; -} +} //namespace parAlmond + +} //namespace libp #endif diff --git a/include/parAlmond/parAlmondAMGSetup.hpp b/include/parAlmond/parAlmondAMGSetup.hpp index 58c80c203..442cc2214 100644 --- a/include/parAlmond/parAlmondAMGSetup.hpp +++ b/include/parAlmond/parAlmondAMGSetup.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -28,52 +28,50 @@ SOFTWARE. #define PARALMOND_AMGSETUP_HPP #include "parAlmond.hpp" -#include "parAlmond/parAlmondMultigrid.hpp" #include "parAlmond/parAlmondAMGLevel.hpp" +namespace libp { namespace parAlmond { class strongGraph_t { public: - platform_t& platform; - MPI_Comm comm; + platform_t platform; + comm_t comm; dlong Nrows=0; dlong Ncols=0; dlong nnz=0; - dlong *rowStarts=nullptr; - dlong *cols=nullptr; + memory rowStarts; + memory cols; - strongGraph_t(dlong N, dlong M, platform_t& _platform, MPI_Comm _comm): + strongGraph_t(dlong N, dlong M, platform_t& _platform, comm_t _comm): platform(_platform), comm(_comm), Nrows(N), Ncols(M) {} - ~strongGraph_t() { - if (rowStarts) free(rowStarts); - if (cols) free(cols); - } }; -amgLevel *coarsenAmgLevel(amgLevel *level, dfloat *null, - StrengthType strtype, dfloat theta, - AggType aggtype); +amgLevel coarsenAmgLevel(amgLevel& level, memory& null, + StrengthType strtype, dfloat theta, + AggType aggtype); -strongGraph_t* strongGraph(parCSR *A, StrengthType type, dfloat theta); +strongGraph_t strongGraph(parCSR& A, StrengthType type, dfloat theta); -void formAggregates(parCSR *A, strongGraph_t *C, - hlong* FineToCoarse, - hlong* globalAggStarts); +void formAggregates(parCSR& A, strongGraph_t& C, + memory FineToCoarse, + memory globalAggStarts); -parCSR *tentativeProlongator(parCSR *A, hlong *FineToCoarse, - hlong *globalAggStarts, dfloat *null); +parCSR tentativeProlongator(parCSR& A, memory FineToCoarse, + memory globalAggStarts, memory null); -parCSR *smoothProlongator(parCSR *A, parCSR *T); +parCSR smoothProlongator(parCSR& A, parCSR& T); -parCSR *transpose(parCSR *A); +parCSR transpose(parCSR& A); -parCSR *SpMM(parCSR *A, parCSR *B); +parCSR SpMM(parCSR& A, parCSR& B); -parCSR *galerkinProd(parCSR *A, parCSR *P); +parCSR galerkinProd(parCSR& A, parCSR& P); -} 
+} //namespace parAlmond + +} //namespace libp #endif diff --git a/include/parAlmond/parAlmondCoarseSolver.hpp b/include/parAlmond/parAlmondCoarseSolver.hpp index 8ff955633..3cd08c791 100644 --- a/include/parAlmond/parAlmondCoarseSolver.hpp +++ b/include/parAlmond/parAlmondCoarseSolver.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -34,100 +34,105 @@ SOFTWARE. #include "parAlmond/parAlmondDefines.hpp" #include "parAlmond/parAlmondparCSR.hpp" +namespace libp { + namespace parAlmond { -class coarseSolver_t: public solver_t { +class coarseSolver_t: public operator_t { public: + platform_t platform; + settings_t settings; + comm_t comm; + int Nrows; int Ncols; - MPI_Comm comm; int rank, size; coarseSolver_t(platform_t& _platform, settings_t& _settings, - MPI_Comm _comm): - solver_t(_platform, _settings), comm(_comm) {} - virtual ~coarseSolver_t() {} + comm_t _comm): + platform(_platform), settings(_settings), + comm(_comm) {} virtual int getTargetSize()=0; - virtual void setup(parCSR *A, bool nullSpace, - dfloat *nullVector, dfloat nullSpacePenalty)=0; + virtual void setup(parCSR& A, bool nullSpace, + memory nullVector, dfloat nullSpacePenalty)=0; virtual void syncToDevice()=0; virtual void Report(int lev)=0; - virtual void solve(occa::memory& o_rhs, occa::memory& o_x)=0; + virtual void solve(deviceMemory& o_rhs, deviceMemory& o_x)=0; }; class exactSolver_t: public coarseSolver_t { public: - parCSR *A=nullptr; + parCSR A; int coarseTotal; int coarseOffset; - int *coarseOffsets=nullptr; - int *coarseCounts=nullptr; - int *sendOffsets=nullptr; - int *sendCounts=nullptr; + memory coarseOffsets; + memory coarseCounts; + memory sendOffsets; + 
memory sendCounts; int N; int offdTotal=0; - dfloat *diagInvAT=nullptr, *offdInvAT=nullptr; - occa::memory o_diagInvAT, o_offdInvAT; + memory diagInvAT, offdInvAT; + deviceMemory o_diagInvAT, o_offdInvAT; - dfloat *diagRhs=nullptr, *offdRhs=nullptr; - occa::memory o_offdRhs; + memory diagRhs, offdRhs; + deviceMemory o_offdRhs; exactSolver_t(platform_t& _platform, settings_t& _settings, - MPI_Comm _comm): + comm_t _comm): coarseSolver_t(_platform, _settings, _comm) {} - ~exactSolver_t(); int getTargetSize(); - void setup(parCSR *A, bool nullSpace, - dfloat *nullVector, dfloat nullSpacePenalty); + void setup(parCSR& A, bool nullSpace, + memory nullVector, dfloat nullSpacePenalty); void syncToDevice(); void Report(int lev); - void solve(occa::memory& o_rhs, occa::memory& o_x); + void solve(deviceMemory& o_rhs, deviceMemory& o_x); }; class oasSolver_t: public coarseSolver_t { public: - parCSR* A; + parCSR A; int N; int diagTotal=0, offdTotal=0; - dfloat *diagInvAT=nullptr, *offdInvAT=nullptr; - occa::memory o_diagInvAT, o_offdInvAT; + memory diagInvAT, offdInvAT; + deviceMemory o_diagInvAT, o_offdInvAT; oasSolver_t(platform_t& _platform, settings_t& _settings, - MPI_Comm _comm): + comm_t _comm): coarseSolver_t(_platform, _settings, _comm) {} - ~oasSolver_t(); int getTargetSize(); - void setup(parCSR *A, bool nullSpace, - dfloat *nullVector, dfloat nullSpacePenalty); + void setup(parCSR& A, bool nullSpace, + memory nullVector, dfloat nullSpacePenalty); void syncToDevice(); void Report(int lev); - void solve(occa::memory& o_rhs, occa::memory& o_x); + void solve(deviceMemory& o_rhs, deviceMemory& o_x); }; -} +} //namespace parAlmond + +} //namespace libp -#endif \ No newline at end of file +#endif diff --git a/include/parAlmond/parAlmondDefines.hpp b/include/parAlmond/parAlmondDefines.hpp index d60afa9a6..5aaa72d2a 100644 --- a/include/parAlmond/parAlmondDefines.hpp +++ b/include/parAlmond/parAlmondDefines.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim 
Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,8 +27,16 @@ SOFTWARE. #ifndef PARALMOND_DEFINES_HPP #define PARALMOND_DEFINES_HPP -#define PARALMOND_NBLOCKS 128 -#define NUMKCYCLES 3 -#define KCYCLETOL 0.2 +namespace libp { -#endif \ No newline at end of file +namespace parAlmond { + +constexpr int PARALMOND_NBLOCKS=128; +constexpr int NUMKCYCLES=3; +constexpr dfloat KCYCLETOL=0.2; + +} //namespace parAlmond + +} //namespace libp + +#endif diff --git a/include/parAlmond/parAlmondKernels.hpp b/include/parAlmond/parAlmondKernels.hpp index f6d388389..80efb2280 100644 --- a/include/parAlmond/parAlmondKernels.hpp +++ b/include/parAlmond/parAlmondKernels.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,38 +27,41 @@ SOFTWARE. #ifndef PARALMOND_KERNELS_HPP #define PARALMOND_KERNELS_HPP +namespace libp { + namespace parAlmond { void buildParAlmondKernels(platform_t& platform); void freeParAlmondKernels(); - extern int Nrefs; - - extern const int blockSize; - extern const int NonzerosPerBlock; + //NC: Hard code these for now. 
Should be sufficient for GPU devices, but needs attention for CPU + constexpr int blockSize = 256; + constexpr int NonzerosPerBlock = 2048; //should be a multiple of blockSize for good unrolling - extern occa::kernel SpMVcsrKernel1; - extern occa::kernel SpMVcsrKernel2; - extern occa::kernel SpMVmcsrKernel; + extern kernel_t SpMVcsrKernel1; + extern kernel_t SpMVcsrKernel2; + extern kernel_t SpMVmcsrKernel; - extern occa::kernel SmoothJacobiCSRKernel; - extern occa::kernel SmoothJacobiMCSRKernel; + extern kernel_t SmoothJacobiCSRKernel; + extern kernel_t SmoothJacobiMCSRKernel; - extern occa::kernel SmoothChebyshevStartKernel; - extern occa::kernel SmoothChebyshevCSRKernel; - extern occa::kernel SmoothChebyshevMCSRKernel; - extern occa::kernel SmoothChebyshevUpdateKernel; + extern kernel_t SmoothChebyshevStartKernel; + extern kernel_t SmoothChebyshevCSRKernel; + extern kernel_t SmoothChebyshevMCSRKernel; + extern kernel_t SmoothChebyshevUpdateKernel; - extern occa::kernel vectorAddInnerProdKernel; - extern occa::kernel vectorAddWeightedInnerProdKernel; - extern occa::kernel kcycleCombinedOp1Kernel; - extern occa::kernel kcycleCombinedOp2Kernel; - extern occa::kernel kcycleWeightedCombinedOp1Kernel; - extern occa::kernel kcycleWeightedCombinedOp2Kernel; + extern kernel_t vectorAddInnerProdKernel; + extern kernel_t vectorAddWeightedInnerProdKernel; + extern kernel_t kcycleCombinedOp1Kernel; + extern kernel_t kcycleCombinedOp2Kernel; + extern kernel_t kcycleWeightedCombinedOp1Kernel; + extern kernel_t kcycleWeightedCombinedOp2Kernel; - extern occa::kernel dGEMVKernel; + extern kernel_t dGEMVKernel; } //namespace parAlmond -#endif \ No newline at end of file +} // namespace libp + +#endif diff --git a/include/parAlmond/parAlmondMultigrid.hpp b/include/parAlmond/parAlmondMultigrid.hpp deleted file mode 100644 index 01aabfbe2..000000000 --- a/include/parAlmond/parAlmondMultigrid.hpp +++ /dev/null @@ -1,123 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim 
Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- -*/ - -#ifndef PARALMOND_MULTIGRID_HPP -#define PARALMOND_MULTIGRID_HPP - -#include "settings.hpp" -#include "platform.hpp" -#include "solver.hpp" -#include "precon.hpp" -#include "parAlmond.hpp" -#include "parAlmond/parAlmondDefines.hpp" -#include "parAlmond/parAlmondCoarseSolver.hpp" - -namespace parAlmond { - -#define PARALMOND_MAX_LEVELS 100 - -typedef enum {VCYCLE=0,KCYCLE=1,EXACT=3} CycleType; -typedef enum {SMOOTHED=0,UNSMOOTHED=1} AggType; -typedef enum {PCG=0,GMRES=1} KrylovType; -typedef enum {DAMPED_JACOBI=0,CHEBYSHEV=1} SmoothType; -typedef enum {RUGESTUBEN=0,SYMMETRIC=1} StrengthType; -typedef enum {COARSEEXACT=0,COARSEOAS=1} CoarseType; - -//multigrid preconditioner -class multigrid_t: public precon_t { -public: - platform_t& platform; - settings_t& settings; - MPI_Comm comm; - - bool exact; - linearSolver_t *linearSolver=nullptr; - - CycleType ctype; - AggType aggtype; - StrengthType strtype; - CoarseType coarsetype; - - int numLevels=0; - int baseLevel=0; - multigridLevel *levels[PARALMOND_MAX_LEVELS]; - - occa::memory o_rhs[PARALMOND_MAX_LEVELS]; - occa::memory o_x[PARALMOND_MAX_LEVELS]; - - coarseSolver_t *coarseSolver; - - //scratch space for smoothing and temporary residual vector - size_t scratchSpaceBytes=0; - occa::memory o_scratch; - - KrylovType ktype; - - occa::memory o_ck[PARALMOND_MAX_LEVELS]; - occa::memory o_vk[PARALMOND_MAX_LEVELS]; - occa::memory o_wk[PARALMOND_MAX_LEVELS]; - - //scratch space - size_t reductionScratchBytes=0; - void *reductionScratch=nullptr; - occa::memory h_reductionScratch; - occa::memory o_reductionScratch; - - multigrid_t(platform_t& _platform, settings_t& _settings, MPI_Comm _comm); - ~multigrid_t(); - - void AddLevel(multigridLevel* level); - - void Operator(occa::memory& o_RHS, occa::memory& o_X); - - void vcycle(const int k, occa::memory& o_RHS, occa::memory& o_X); - void kcycle(const int k, occa::memory& o_RHS, occa::memory& o_X); - -private: - void kcycleOp1(multigridLevel* level, - occa::memory& o_X, 
occa::memory& o_RHS, - occa::memory& o_CK, occa::memory& o_VK, - dfloat *alpha1, dfloat *rho1, - dfloat *norm_rhs, dfloat *norm_rhstilde); - - void kcycleOp2(multigridLevel* level, - occa::memory& o_X, occa::memory& o_RHS, - occa::memory& o_CK, occa::memory& o_VK, occa::memory& o_WK, - const dfloat alpha1, const dfloat rho1); - - void kcycleCombinedOp1(multigridLevel* level, dfloat *aDotbc, occa::memory& o_a, - occa::memory& o_b, occa::memory& o_c); - void kcycleCombinedOp2(multigridLevel* level, dfloat *aDotbcd, - occa::memory& o_a, occa::memory& o_b, - occa::memory& o_c, occa::memory& o_d); - dfloat vectorAddInnerProd(multigridLevel* level, - const dfloat alpha, occa::memory& o_x, - const dfloat beta, occa::memory& o_y); -}; - -} - -#endif \ No newline at end of file diff --git a/include/parAlmond/parAlmondparCSR.hpp b/include/parAlmond/parAlmondparCSR.hpp index 48a89c3f0..893a1ee07 100644 --- a/include/parAlmond/parAlmondparCSR.hpp +++ b/include/parAlmond/parAlmondparCSR.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,30 +27,32 @@ SOFTWARE. 
#ifndef PARALMOND_PARCSR_HPP #define PARALMOND_PARCSR_HPP +namespace libp { + namespace parAlmond { class parCSR { public: - platform_t& platform; - MPI_Comm comm; + platform_t platform; + comm_t comm; - dlong Nrows; - dlong Ncols; + dlong Nrows=0; + dlong Ncols=0; //local sparse matrix struct CSR { dlong nnz=0; dlong NrowBlocks=0; - dlong *blockRowStarts=nullptr; - dlong *rowStarts=nullptr; - dlong *cols=nullptr; - pfloat *vals=nullptr; + memory blockRowStarts; + memory rowStarts; + memory cols; + memory vals; - occa::memory o_blockRowStarts; - occa::memory o_rowStarts; - occa::memory o_cols; - occa::memory o_vals; + deviceMemory o_blockRowStarts; + deviceMemory o_rowStarts; + deviceMemory o_cols; + deviceMemory o_vals; }; CSR diag; @@ -60,47 +62,46 @@ class parCSR { dlong nzRows=0; dlong NrowBlocks=0; - dlong *blockRowStarts=nullptr; - dlong *rowStarts=nullptr; - dlong *mRowStarts=nullptr; //compressed version of rowStarts - dlong *rows=nullptr; - dlong *cols=nullptr; - pfloat *vals=nullptr; - - occa::memory o_blockRowStarts; - occa::memory o_mRowStarts; - occa::memory o_rows; - occa::memory o_cols; - occa::memory o_vals; + memory blockRowStarts; + memory rowStarts; + memory mRowStarts; //compressed version of rowStarts + memory rows; + memory cols; + memory vals; + + deviceMemory o_blockRowStarts; + deviceMemory o_mRowStarts; + deviceMemory o_rows; + deviceMemory o_cols; + deviceMemory o_vals; }; MCSR offd; - dfloat *diagA=nullptr; - dfloat *diagInv=nullptr; + memory diagA; + memory diagInv; - occa::memory o_diagA; - occa::memory o_diagInv; + deviceMemory o_diagA; + deviceMemory o_diagInv; //partition info - hlong *globalRowStarts=nullptr; - hlong *globalColStarts=nullptr; - hlong *colMap=nullptr; + memory globalRowStarts; + memory globalColStarts; + memory colMap; - halo_t *halo = nullptr; + ogs::halo_t halo; dlong NlocalCols = 0; //rho ~= cond(invD * A) dfloat rho=0.0; - parCSR(dlong N, dlong M, platform_t& _platform, MPI_Comm _comm): + parCSR() = default; + 
parCSR(dlong N, dlong M, platform_t& _platform, comm_t _comm): platform(_platform), comm(_comm), Nrows(N), Ncols(M) {} //build a parCSR matrix from a distributed COO matrix parCSR(parCOO& A); - ~parCSR(); - - void haloSetup(hlong *colIds); + void haloSetup(memory colIds); void diagSetup(); @@ -108,28 +109,28 @@ class parCSR { void syncToDevice(); - void SpMV(const dfloat alpha, dfloat *x, - const dfloat beta, dfloat *y); - void SpMV(const dfloat alpha, dfloat *x, - const dfloat beta, const dfloat *y, dfloat *z); + void SpMV(const dfloat alpha, memory& x, + const dfloat beta, memory& y); + void SpMV(const dfloat alpha, memory& x, + const dfloat beta, const memory& y, memory& z); - void SpMV(const dfloat alpha, occa::memory& o_x, const dfloat beta, - occa::memory& o_y); - void SpMV(const dfloat alpha, occa::memory& o_x, const dfloat beta, - occa::memory& o_y, occa::memory& o_z); + void SpMV(const dfloat alpha, deviceMemory& o_x, const dfloat beta, + deviceMemory& o_y); + void SpMV(const dfloat alpha, deviceMemory& o_x, const dfloat beta, + deviceMemory& o_y, deviceMemory& o_z); - void smoothDampedJacobi(occa::memory& o_r, occa::memory& o_x, + void smoothDampedJacobi(deviceMemory& o_r, deviceMemory& o_x, const dfloat lambda, bool x_is_zero, - occa::memory& o_scratch); + deviceMemory& o_scratch); - void smoothChebyshev(occa::memory& o_b, occa::memory& o_x, + void smoothChebyshev(deviceMemory& o_b, deviceMemory& o_x, const dfloat lambda0, const dfloat lambda1, - bool x_is_zero, occa::memory& o_scratch, + bool x_is_zero, deviceMemory& o_scratch, const int ChebyshevIterations); }; - - } //namespace parAlmond +} //namespace libp + #endif diff --git a/include/platform.hpp b/include/platform.hpp index 4eaffef08..0dc1761c3 100644 --- a/include/platform.hpp +++ b/include/platform.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -28,40 +28,58 @@ SOFTWARE. #define PLATFORM_HPP #define LIBP_MAJOR_VERSION 0 -#define LIBP_MINOR_VERSION 4 +#define LIBP_MINOR_VERSION 5 #define LIBP_PATCH_VERSION 0 -#define LIBP_VERSION 00400 -#define LIBP_VERSION_STR "0.4.0" +#define LIBP_VERSION 00500 +#define LIBP_VERSION_STR "0.5.0" #include "core.hpp" +#include "memory.hpp" +#include "comm.hpp" #include "settings.hpp" #include "linAlg.hpp" +namespace libp { + class platformSettings_t: public settings_t { public: - platformSettings_t(MPI_Comm _comm); + platformSettings_t(comm_t _comm); void report(); }; +namespace internal { + +class iplatform_t { +public: + platformSettings_t settings; + properties_t props; + + iplatform_t(platformSettings_t& _settings): + settings(_settings) { + } +}; + +} //namespace internal + class platform_t { public: - const MPI_Comm& comm; - platformSettings_t& settings; - occa::properties props; + private: + std::shared_ptr iplatform; + std::shared_ptr ilinAlg; - occa::device device; - linAlg_t linAlg; + public: + comm_t comm; + device_t device; - int rank, size; + platform_t()=default; - platform_t(platformSettings_t& _settings): - comm(_settings.comm), - settings(_settings) { + platform_t(platformSettings_t& _settings) { + + iplatform = std::make_shared(_settings); - MPI_Comm_rank(comm, &rank); - MPI_Comm_size(comm, &size); + comm = settings().comm; - if (rank==0) { + if (comm.rank()==0) { std::cout << "\n"; std::cout << "\033[1m"; std::cout << " _ _ _ ____ _ \n"; @@ -80,44 +98,140 @@ class platform_t { DeviceConfig(); DeviceProperties(); - linAlg.Setup(this); + ilinAlg = std::make_shared(this); + } + + platform_t(const platform_t &other)=default; + platform_t& operator = (const platform_t &other)=default; + + bool isInitialized() { + return (iplatform!=nullptr); + } + + void assertInitialized() { + LIBP_ABORT("Platform not 
initialized.", + !isInitialized()); + } + + kernel_t buildKernel(std::string fileName, std::string kernelName, + properties_t& kernelInfo); + + template + deviceMemory malloc(const size_t count, + const properties_t &prop = properties_t()) { + assertInitialized(); + if (occa::dtype::get() == occa::dtype::none) { + return deviceMemory(device.malloc(count*sizeof(T), occa::dtype::byte, prop)); + } else { + return deviceMemory(device.malloc(count, prop)); + } + } + + template + deviceMemory malloc(const size_t count, + const memory src, + const properties_t &prop = properties_t()) { + assertInitialized(); + if (occa::dtype::get() == occa::dtype::none) { + return deviceMemory(device.malloc(count*sizeof(T), occa::dtype::byte, src.ptr(), prop)); + } else { + return deviceMemory(device.malloc(count, src.ptr(), prop)); + } + } + + template + deviceMemory malloc(const memory src, + const properties_t &prop = properties_t()) { + assertInitialized(); + if (occa::dtype::get() == occa::dtype::none) { + return deviceMemory(device.malloc(src.size(), occa::dtype::byte, src.ptr(), prop)); + } else { + return deviceMemory(device.malloc(src.length(), src.ptr(), prop)); + } + } + + template + pinnedMemory hostMalloc(const size_t count){ + assertInitialized(); + properties_t hostProp("host", true); + if (occa::dtype::get() == occa::dtype::none) { + return pinnedMemory(device.malloc(count*sizeof(T), occa::dtype::byte, nullptr, hostProp)); + } else { + return pinnedMemory(device.malloc(count, nullptr, hostProp)); + } + } + + template + pinnedMemory hostMalloc(const size_t count, + const memory src){ + assertInitialized(); + properties_t hostProp("host", true); + if (occa::dtype::get() == occa::dtype::none) { + return pinnedMemory(device.malloc(count*sizeof(T), occa::dtype::byte, src.ptr(), hostProp)); + } else { + return pinnedMemory(device.malloc(count, src.ptr(), hostProp)); + } } - ~platform_t(){} + template + pinnedMemory hostMalloc(const memory src){ + assertInitialized(); + 
properties_t hostProp("host", true); + if (occa::dtype::get() == occa::dtype::none) { + return pinnedMemory(device.malloc(src.size(), occa::dtype::byte, src.ptr(), hostProp)); + } else { + return pinnedMemory(device.malloc(src.length(), src.ptr(), hostProp)); + } + } - occa::kernel buildKernel(std::string fileName, std::string kernelName, - occa::properties& kernelInfo); + linAlg_t& linAlg() { + assertInitialized(); + return *ilinAlg; + } - occa::memory malloc(const size_t bytes, - const void *src = NULL, - const occa::properties &prop = occa::properties()) { - return device.malloc(bytes, src, prop); + settings_t& settings() { + assertInitialized(); + return iplatform->settings; } - occa::memory malloc(const size_t bytes, - const occa::memory &src, - const occa::properties &prop = occa::properties()) { - return device.malloc(bytes, src, prop); + properties_t& props() { + assertInitialized(); + return iplatform->props; } - occa::memory malloc(const size_t bytes, - const occa::properties &prop) { - return device.malloc(bytes, prop); + void finish() { + device.finish(); } - void *hostMalloc(const size_t bytes, - const void *src, - occa::memory &h_mem){ - occa::properties hostProp; - hostProp["host"] = true; - h_mem = device.malloc(bytes, src, hostProp); - return h_mem.ptr(); + stream_t getStream() { + return device.getStream(); } -private: + void setStream(stream_t stream) { + device.setStream(stream); + } + + const int rank() const { + return comm.rank(); + } + + const int size() const { + return comm.size(); + } + + int getDeviceCount(const std::string mode) { + return occa::getDeviceCount(mode); + } + + void setCacheDir(const std::string cacheDir) { + occa::env::setOccaCacheDir(cacheDir); + } + + private: void DeviceConfig(); void DeviceProperties(); - }; -#endif \ No newline at end of file +} //namespace libp + +#endif diff --git a/include/precon.hpp b/include/precon.hpp index d3975b7f0..4be0e32ae 100644 --- a/include/precon.hpp +++ b/include/precon.hpp @@ -2,7 
+2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -28,28 +28,46 @@ SOFTWARE. #define PRECON_HPP #include "core.hpp" +#include "operator.hpp" -//base preconditioner -class precon_t { -public: - precon_t() {}; +namespace libp { + +/*Abstracted Preconditioner Object*/ +class precon_t: public operator_t { + public: + void Operator(deviceMemory &o_r, deviceMemory &o_Mr) { + assertInitialized(); + precon->Operator(o_r, o_Mr); + } - virtual void Operator(occa::memory &o_r, occa::memory &o_Mr)=0; + /*Generic setup. Create a Precon object and wrap it in a shared_ptr*/ + template + void Setup(Args&& ... args) { + precon = std::make_shared(args...); + } - virtual ~precon_t() {} + private: + std::shared_ptr precon=nullptr; + + void assertInitialized() { + LIBP_ABORT("Precon not initialized", + precon==nullptr); + } }; //Identity operator -class IdentityPrecon: public precon_t { +class IdentityPrecon: public operator_t { private: dlong N; public: IdentityPrecon(dlong _N): N(_N) {} - void Operator(occa::memory &o_r, occa::memory &o_Mr){ - o_Mr.copyFrom(o_r, N*sizeof(dfloat)); //identity + void Operator(deviceMemory &o_r, deviceMemory &o_Mr){ + o_Mr.copyFrom(o_r, N); //identity } }; +} //namespace libp + #endif diff --git a/include/settings.hpp b/include/settings.hpp index aa404f64a..843e5c55e 100644 --- a/include/settings.hpp +++ b/include/settings.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal 
@@ -28,28 +28,30 @@ SOFTWARE. #define SETTINGS_HPP #include +#include #include #include #include #include #include "core.hpp" -using std::string; -using std::vector; -using std::ostream; -using std::stringstream; +namespace libp { class setting_t { + using string = std::string; + using stringstream = std::stringstream; + private: string name; string val; string description; - vector options; + std::vector options; public: setting_t() = default; - setting_t(string name_, string val_, string description_="", vector options_={}); + setting_t(string name_, string val_, + string description_="", std::vector options_={}); ~setting_t() = default; @@ -58,7 +60,7 @@ class setting_t { const string& getName() const; const string& getDescription() const; - const vector& getOptions() const; + const std::vector& getOptions() const; template T getVal() const { @@ -75,28 +77,29 @@ class setting_t { string toString() const; }; -ostream& operator<<(ostream& os, const setting_t& setting); +std::ostream& operator<<(std::ostream& os, const setting_t& setting); class settings_t { + using string = std::string; + using stringstream = std::stringstream; + private: - vector insertOrder; + std::vector insertOrder; public: - const MPI_Comm comm; - std::map settings; + comm_t comm; + std::map settings; - settings_t() = delete; - settings_t(MPI_Comm _comm); - - ~settings_t(); + settings_t() = default; + settings_t(comm_t _comm); //copy - settings_t(const settings_t& other); - settings_t& operator=(const settings_t& other); + settings_t(const settings_t& other)=default; + settings_t& operator=(const settings_t& other)=default; void newSetting(const string name, const string val, - const string description="", - const vector options={}); + const string description="", + const std::vector options={}); bool hasSetting(const string name); @@ -109,12 +112,10 @@ class settings_t { void getSetting(const string name, T& value) const { auto search = settings.find(name); if (search != settings.end()) { - 
setting_t* val = search->second; - value = val->getVal(); + const setting_t& val = search->second; + value = val.getVal(); } else { - stringstream ss; - ss << "Unable to find setting: [" << name << "]"; - LIBP_ABORT(ss.str()); + LIBP_FORCE_ABORT("Unable to find setting: [" << name << "]"); } } @@ -127,6 +128,6 @@ class settings_t { void reportSetting(const string name) const; }; - +} //namespace libp #endif diff --git a/include/solver.hpp b/include/solver.hpp index 4117b5c9e..d2fc37239 100644 --- a/include/solver.hpp +++ b/include/solver.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -29,77 +29,82 @@ SOFTWARE. #include "settings.hpp" #include "platform.hpp" +#include "operator.hpp" -class solver_t { +namespace libp { + +class solver_t: public operator_t { public: - platform_t& platform; - settings_t& settings; + platform_t platform; + settings_t settings; + comm_t comm; - solver_t() = delete; + solver_t() = default; - solver_t(platform_t& _platform, settings_t& _settings): + solver_t(platform_t& _platform, settings_t& _settings, comm_t _comm): platform(_platform), - settings(_settings) {}; - - virtual ~solver_t(){} + settings(_settings), + comm(_comm) {}; virtual void Run() { - LIBP_ABORT(string("Run not implemented in this solver")) + LIBP_FORCE_ABORT("Run not implemented in this solver"); }; - virtual void Report(dfloat time=0.0, int tstep=0) { - LIBP_ABORT(string("Report not implemented in this solver")) + virtual void Report(dfloat time, int tstep) { + LIBP_FORCE_ABORT("Report not implemented in this solver"); } //Full rhs evaluation of solver in form dq/dt = rhsf(q,t) - virtual void rhsf(occa::memory& o_q, occa::memory& o_rhs, const dfloat time) { - 
LIBP_ABORT(string("rhsf not implemented in this solver")) + virtual void rhsf(deviceMemory& o_q, deviceMemory& o_rhs, const dfloat time) { + LIBP_FORCE_ABORT("rhsf not implemented in this solver"); } // Partial rhs evaluation of f with solver in form dq/dt = f(q,t) + g(q,t) - virtual void rhs_imex_f(occa::memory& o_q, occa::memory& o_rhs, const dfloat time) { - LIBP_ABORT(string("rhs_imex_f not implemented in this solver")) + virtual void rhs_imex_f(deviceMemory& o_q, deviceMemory& o_rhs, const dfloat time) { + LIBP_FORCE_ABORT("rhs_imex_f not implemented in this solver"); } // Partial rhs evaluation of g with solver in form dq/dt = f(q,t) + g(q,t) - virtual void rhs_imex_g(occa::memory& o_q, occa::memory& o_rhs, const dfloat time) { - LIBP_ABORT(string("rhs_imex_g not implemented in this solver")) + virtual void rhs_imex_g(deviceMemory& o_q, deviceMemory& o_rhs, const dfloat time) { + LIBP_FORCE_ABORT("rhs_imex_g not implemented in this solver"); } // Inversion of g function with solver in form dq/dt = f(q,t) + g(q,t) // Solves gamma*q - g(q,t) = rhs for q - virtual void rhs_imex_invg(occa::memory& o_rhs, occa::memory& o_q, const dfloat gamma, const dfloat time) { - LIBP_ABORT(string("rhs_imex_invg not implemented in this solver")) + virtual void rhs_imex_invg(deviceMemory& o_rhs, deviceMemory& o_q, const dfloat gamma, const dfloat time) { + LIBP_FORCE_ABORT("rhs_imex_invg not implemented in this solver"); } // Evolve rhs f function via a sub-timestepper - virtual void rhs_subcycle_f(occa::memory& o_Q, occa::memory& o_QHAT, - const dfloat T, const dfloat dt, const dfloat* B, + virtual void rhs_subcycle_f(deviceMemory& o_Q, deviceMemory& o_QHAT, + const dfloat T, const dfloat dt, const memory B, const int order, const int shiftIndex, const int maxOrder) { - LIBP_ABORT(string("Subcycling not implemented in this solver")) + LIBP_FORCE_ABORT("Subcycling not implemented in this solver"); } //Full rhs evaluation of solver in form dq/dt = rhsf(q,t) for multi-rate 
timestepping - virtual void rhsf_MR(occa::memory& o_q, occa::memory& o_rhs, occa::memory& o_fQM, const dfloat time, const int level) { - LIBP_ABORT(string("rhsf_MR not implemented in this solver")) + virtual void rhsf_MR(deviceMemory& o_q, deviceMemory& o_rhs, deviceMemory& o_fQM, const dfloat time, const int level) { + LIBP_FORCE_ABORT("rhsf_MR not implemented in this solver"); } //Full rhs evaluation of solver in form dq/dt = rhsf(q,t) with a perfectly matched layer (PML) - virtual void rhsf_pml(occa::memory& o_q, occa::memory& o_pmlq, - occa::memory& o_rhs, occa::memory& o_pmlrhs, const dfloat time) { - LIBP_ABORT(string("rhsf_pml not implemented in this solver")) + virtual void rhsf_pml(deviceMemory& o_q, deviceMemory& o_pmlq, + deviceMemory& o_rhs, deviceMemory& o_pmlrhs, const dfloat time) { + LIBP_FORCE_ABORT("rhsf_pml not implemented in this solver"); } //Full rhs evaluation of solver in form dq/dt = rhsf(q,t) for multi-rate timestepping with a PML - virtual void rhsf_MR_pml(occa::memory& o_q, occa::memory& o_pmlq, - occa::memory& o_rhs, occa::memory& o_pmlrhs, - occa::memory& o_fQM, const dfloat time, const int level) { - LIBP_ABORT(string("rhsf_MR_pml not implemented in this solver")) + virtual void rhsf_MR_pml(deviceMemory& o_q, deviceMemory& o_pmlq, + deviceMemory& o_rhs, deviceMemory& o_pmlrhs, + deviceMemory& o_fQM, const dfloat time, const int level) { + LIBP_FORCE_ABORT("rhsf_MR_pml not implemented in this solver"); } //Evaluation of solver as a operator in the form A(q) - virtual void Operator(occa::memory& o_q, occa::memory& o_Aq) { - LIBP_ABORT(string("Operator not implemented in this solver")) + virtual void Operator(deviceMemory& o_q, deviceMemory& o_Aq) { + LIBP_FORCE_ABORT("Operator not implemented in this solver"); } }; -#endif \ No newline at end of file +} //namespace libp + +#endif diff --git a/include/timeStepper.hpp b/include/timeStepper.hpp index 153b21561..f8d5e79b3 100644 --- a/include/timeStepper.hpp +++ b/include/timeStepper.hpp @@ 
-2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -32,101 +32,140 @@ SOFTWARE. #include "mesh.hpp" #include "solver.hpp" +namespace libp { + +//forward declare +namespace TimeStepper { class timeStepperBase_t; } + +/* General TimeStepper object*/ +class timeStepper_t { + public: + timeStepper_t() = default; + + /*Generic setup. Create a Stepper object and wrap it in a shared_ptr*/ + template + void Setup(Args&& ... args) { + ts = std::make_shared(args...); + } + + void Run(solver_t& solver, deviceMemory& o_q, dfloat start, dfloat end); + + void SetTimeStep(dfloat dt_); + + dfloat GetTimeStep(); + + dfloat GetGamma(); + + private: + std::shared_ptr ts=nullptr; + + void assertInitialized(); +}; + namespace TimeStepper { //base time stepper class -class timeStepper_t { +class timeStepperBase_t { public: + platform_t platform; + comm_t comm; + dlong N; dlong Nhalo; - solver_t& solver; - dfloat dt; - timeStepper_t(dlong Nelements, dlong NhaloElements, - int Np, int Nfields, solver_t& _solver): + timeStepperBase_t(dlong Nelements, dlong NhaloElements, + int Np, int Nfields, + platform_t& _platform, comm_t _comm): + platform(_platform), + comm(_comm), N(Nelements*Np*Nfields), - Nhalo(NhaloElements*Np*Nfields), - solver(_solver) {} + Nhalo(NhaloElements*Np*Nfields) {} - virtual ~timeStepper_t() {}; - virtual void Run(occa::memory& o_q, dfloat start, dfloat end)=0; + virtual void Run(solver_t& solver, deviceMemory& o_q, dfloat start, dfloat end)=0; void SetTimeStep(dfloat dt_) {dt = dt_;}; + dfloat GetTimeStep() {return dt;}; + + virtual dfloat GetGamma() { + LIBP_FORCE_ABORT("GetGamma() not available in this Timestepper"); + return 0.0; + } }; /* Adams Bashforth, order 3 */ -class 
ab3: public timeStepper_t { +class ab3: public timeStepperBase_t { protected: int Nstages; int shiftIndex; - dfloat *ab_a; - occa::memory o_ab_a; + memory ab_a; + deviceMemory o_ab_a; - occa::memory o_rhsq; + deviceMemory o_rhsq; - occa::kernel updateKernel; + kernel_t updateKernel; - virtual void Step(occa::memory& o_q, dfloat time, dfloat dt, int order); + virtual void Step(solver_t& solver, deviceMemory& o_q, dfloat time, dfloat dt, int order); public: ab3(dlong Nelements, dlong NhaloElements, - int Np, int Nfields, solver_t& solver); - ~ab3(); + int Np, int Nfields, + platform_t& _platform, comm_t _comm); - void Run(occa::memory& o_q, dfloat start, dfloat end); + void Run(solver_t& solver, deviceMemory& o_q, dfloat start, dfloat end); }; /* Low-Storage Explicit Runge-Kutta, order 4 */ -class lserk4: public timeStepper_t { +class lserk4: public timeStepperBase_t { protected: int Nrk; - dfloat *rka, *rkb, *rkc; + memory rka, rkb, rkc; + + deviceMemory o_rhsq; + deviceMemory o_resq; - occa::memory o_rhsq; - occa::memory o_resq; + deviceMemory o_saveq; - occa::kernel updateKernel; + kernel_t updateKernel; - virtual void Step(occa::memory& o_q, dfloat time, dfloat dt); + virtual void Step(solver_t& solver, deviceMemory& o_q, dfloat time, dfloat dt); public: lserk4(dlong Nelements, dlong NhaloElements, - int Np, int Nfields, solver_t& solver); - ~lserk4(); + int Np, int Nfields, + platform_t& _platform, comm_t _comm); - void Run(occa::memory& o_q, dfloat start, dfloat end); + void Run(solver_t& solver, deviceMemory& o_q, dfloat start, dfloat end); }; /* Dormand-Prince method */ /* Explict Runge-Kutta, order 5 with embedded order 4 and adaptive time-stepping */ -class dopri5: public timeStepper_t { +class dopri5: public timeStepperBase_t { protected: - MPI_Comm comm; int Nrk; dlong Nblock; - dfloat *rkC, *rkA, *rkE; - occa::memory o_rkA, o_rkE; + memory rkC, rkA, rkE; + deviceMemory o_rkA, o_rkE; - dfloat *errtmp; - occa::memory o_errtmp, h_errtmp; + deviceMemory 
o_errtmp; + pinnedMemory h_errtmp; - occa::memory o_rhsq; - occa::memory o_rkq; - occa::memory o_rkrhsq; - occa::memory o_rkerr; + deviceMemory o_rhsq; + deviceMemory o_rkq; + deviceMemory o_rkrhsq; + deviceMemory o_rkerr; - occa::memory o_saveq; + deviceMemory o_saveq; - occa::kernel rkUpdateKernel; - occa::kernel rkStageKernel; - occa::kernel rkErrorEstimateKernel; + kernel_t rkUpdateKernel; + kernel_t rkStageKernel; + kernel_t rkErrorEstimateKernel; dfloat dtMIN; //minumum allowed timestep dfloat ATOL; //absolute error tolerance @@ -144,24 +183,24 @@ class dopri5: public timeStepper_t { dfloat facold; dfloat sqrtinvNtotal; - virtual void Backup(occa::memory &o_Q); - virtual void Restore(occa::memory &o_Q); - virtual void AcceptStep(occa::memory &o_q, occa::memory &o_rq); + virtual void Backup(deviceMemory &o_Q); + virtual void Restore(deviceMemory &o_Q); + virtual void AcceptStep(deviceMemory &o_q, deviceMemory &o_rq); - virtual void Step(occa::memory& o_q, dfloat time, dfloat dt); + virtual void Step(solver_t& solver, deviceMemory& o_q, dfloat time, dfloat dt); - virtual dfloat Estimater(occa::memory& o_q); + virtual dfloat Estimater(deviceMemory& o_q); public: dopri5(dlong Nelements, dlong NhaloElements, - int Np, int Nfields, solver_t& solver, MPI_Comm _comm); - ~dopri5(); + int Np, int Nfields, + platform_t& _platform, comm_t _comm); - void Run(occa::memory& o_q, dfloat start, dfloat end); + void Run(solver_t& solver, deviceMemory& o_q, dfloat start, dfloat end); }; /* Semi-Analytic Adams-Bashforth, order 3 */ -class saab3: public timeStepper_t { +class saab3: public timeStepperBase_t { protected: int Nstages; int shiftIndex; @@ -169,59 +208,56 @@ class saab3: public timeStepper_t { int Np, Nfields; dlong Nblock, Nelements, NhaloElements; - dfloat *lambda; + memory lambda; - dfloat *saab_x, *saab_a; - occa::memory o_saab_x, o_saab_a; + pinnedMemory h_saab_x, h_saab_a; + deviceMemory o_saab_x, o_saab_a; - occa::memory o_rhsq; + deviceMemory o_rhsq; - 
occa::kernel updateKernel; + kernel_t updateKernel; - virtual void Step(occa::memory& o_q, dfloat time, dfloat dt, int order); + virtual void Step(solver_t& solver, deviceMemory& o_q, dfloat time, dfloat dt, int order); virtual void UpdateCoefficients(); public: saab3(dlong _Nelements, dlong _NhaloElements, int _Np, int _Nfields, - dfloat *_lambda, - solver_t& _solver); - ~saab3(); + memory _lambda, + platform_t& _platform, comm_t _comm); - void Run(occa::memory& o_q, dfloat start, dfloat end); + void Run(solver_t& solver, deviceMemory& o_q, dfloat start, dfloat end); }; /* Semi-Analytic Explict Runge-Kutta, order 4 with embedded order 3 and adaptive time-stepping */ -class sark4: public timeStepper_t { +class sark4: public timeStepperBase_t { protected: - MPI_Comm comm; int Nrk; int order, embeddedOrder; int Np, Nfields; dlong Nblock, Nelements, NhaloElements; - dfloat *lambda; - - dfloat *rkC, *rkX, *rkA, *rkE; - occa::memory h_rkX, h_rkA, h_rkE; - occa::memory o_rkX, o_rkA, o_rkE; + memory lambda; - dfloat *errtmp; + memory rkC; + deviceMemory o_rkX, o_rkA, o_rkE; + pinnedMemory h_rkX, h_rkA, h_rkE; - occa::memory o_rhsq; - occa::memory o_rkq; - occa::memory o_rkrhsq; - occa::memory o_rkerr; + deviceMemory o_rhsq; + deviceMemory o_rkq; + deviceMemory o_rkrhsq; + deviceMemory o_rkerr; - occa::memory o_saveq; + deviceMemory o_saveq; - occa::memory o_errtmp; + deviceMemory o_errtmp; + pinnedMemory h_errtmp; - occa::kernel rkUpdateKernel; - occa::kernel rkStageKernel; - occa::kernel rkErrorEstimateKernel; + kernel_t rkUpdateKernel; + kernel_t rkStageKernel; + kernel_t rkErrorEstimateKernel; dfloat dtMIN; //minumum allowed timestep dfloat ATOL; //absolute error tolerance @@ -239,56 +275,54 @@ class sark4: public timeStepper_t { dfloat facold; dfloat sqrtinvNtotal; - virtual void Backup(occa::memory &o_Q); - virtual void Restore(occa::memory &o_Q); - virtual void AcceptStep(occa::memory &o_q, occa::memory &o_rq); + virtual void Backup(deviceMemory &o_Q); + virtual 
void Restore(deviceMemory &o_Q); + virtual void AcceptStep(deviceMemory &o_q, deviceMemory &o_rq); - virtual void Step(occa::memory& o_q, dfloat time, dfloat dt); + virtual void Step(solver_t& solver, deviceMemory& o_q, dfloat time, dfloat dt); - dfloat Estimater(occa::memory& o_q); + dfloat Estimater(deviceMemory& o_q); void UpdateCoefficients(); public: sark4(dlong _Nelements, dlong _NhaloElements, int _Np, int _Nfields, - dfloat *_lambda, - solver_t& _solver, MPI_Comm _comm); - ~sark4(); + memory _lambda, + platform_t& _platform, comm_t _comm); - void Run(occa::memory& o_q, dfloat start, dfloat end); + void Run(solver_t& solver, deviceMemory& o_q, dfloat start, dfloat end); }; /* Semi-Analytic Explict Runge-Kutta, order 5 with embedded order 4 and adaptive time-stepping */ -class sark5: public timeStepper_t { +class sark5: public timeStepperBase_t { protected: - MPI_Comm comm; int Nrk; int order, embeddedOrder; int Np, Nfields; dlong Nblock, Nelements, NhaloElements; - dfloat *lambda; + memory lambda; - dfloat *rkC, *rkX, *rkA, *rkE; - occa::memory h_rkX, h_rkA, h_rkE; - occa::memory o_rkX, o_rkA, o_rkE; + memory rkC; + deviceMemory o_rkX, o_rkA, o_rkE; + pinnedMemory h_rkX, h_rkA, h_rkE; - dfloat *errtmp; - occa::memory o_rhsq; - occa::memory o_rkq; - occa::memory o_rkrhsq; - occa::memory o_rkerr; + deviceMemory o_rhsq; + deviceMemory o_rkq; + deviceMemory o_rkrhsq; + deviceMemory o_rkerr; - occa::memory o_saveq; + deviceMemory o_saveq; - occa::memory o_errtmp; + deviceMemory o_errtmp; + pinnedMemory h_errtmp; - occa::kernel rkUpdateKernel; - occa::kernel rkStageKernel; - occa::kernel rkErrorEstimateKernel; + kernel_t rkUpdateKernel; + kernel_t rkStageKernel; + kernel_t rkErrorEstimateKernel; dfloat dtMIN; //minumum allowed timestep dfloat ATOL; //absolute error tolerance @@ -306,154 +340,151 @@ class sark5: public timeStepper_t { dfloat facold; dfloat sqrtinvNtotal; - virtual void Backup(occa::memory &o_Q); - virtual void Restore(occa::memory &o_Q); - virtual 
void AcceptStep(occa::memory &o_q, occa::memory &o_rq); + virtual void Backup(deviceMemory &o_Q); + virtual void Restore(deviceMemory &o_Q); + virtual void AcceptStep(deviceMemory &o_q, deviceMemory &o_rq); - virtual void Step(occa::memory& o_q, dfloat time, dfloat dt); + virtual void Step(solver_t& solver, deviceMemory& o_q, dfloat time, dfloat dt); - dfloat Estimater(occa::memory& o_q); + dfloat Estimater(deviceMemory& o_q); void UpdateCoefficients(); public: sark5(dlong _Nelements, dlong _NhaloElements, int _Np, int _Nfields, - dfloat *_lambda, - solver_t& _solver, MPI_Comm _comm); - ~sark5(); + memory _lambda, + platform_t& _platform, comm_t _comm); - void Run(occa::memory& o_q, dfloat start, dfloat end); + void Run(solver_t& solver, deviceMemory& o_q, dfloat start, dfloat end); }; /* Backward Difference Formula, order 3, with extrapolation */ -class extbdf3: public timeStepper_t { +class extbdf3: public timeStepperBase_t { protected: int Nstages; int shiftIndex; - dfloat *extbdf_a; - dfloat *extbdf_b; - occa::memory o_extbdf_a; - occa::memory o_extbdf_b; + memory extbdf_a; + memory extbdf_b; + deviceMemory o_extbdf_a; + deviceMemory o_extbdf_b; - occa::memory o_rhs; - occa::memory o_qn; - occa::memory o_F; + deviceMemory o_rhs; + deviceMemory o_qn; + deviceMemory o_F; - occa::kernel rhsKernel; + kernel_t rhsKernel; - virtual void Step(occa::memory& o_q, dfloat time, dfloat dt, int order); + virtual void Step(solver_t& solver, deviceMemory& o_q, dfloat time, dfloat dt, int order); public: extbdf3(dlong Nelements, dlong NhaloElements, - int Np, int Nfields, solver_t& solver); - ~extbdf3(); + int Np, int Nfields, + platform_t& _platform, comm_t _comm); - dfloat getGamma(); + dfloat GetGamma(); - void Run(occa::memory& o_q, dfloat start, dfloat end); + void Run(solver_t& solver, deviceMemory& o_q, dfloat start, dfloat end); }; /* Backward Difference Formula, order 3, with subcycling */ -class ssbdf3: public timeStepper_t { +class ssbdf3: public timeStepperBase_t { 
protected: int Nstages; int shiftIndex; - dfloat *ssbdf_b; - occa::memory o_ssbdf_b; + memory ssbdf_b; + deviceMemory o_ssbdf_b; - occa::memory o_rhs; - occa::memory o_qn; - occa::memory o_qhat; + deviceMemory o_rhs; + deviceMemory o_qn; + deviceMemory o_qhat; - occa::kernel rhsKernel; + kernel_t rhsKernel; - virtual void Step(occa::memory& o_q, dfloat time, dfloat dt, int order); + virtual void Step(solver_t& solver, deviceMemory& o_q, dfloat time, dfloat dt, int order); public: ssbdf3(dlong Nelements, dlong NhaloElements, - int Np, int Nfields, solver_t& solver); - ~ssbdf3(); + int Np, int Nfields, + platform_t& _platform, comm_t _comm); - dfloat getGamma(); + dfloat GetGamma(); - void Run(occa::memory& o_q, dfloat start, dfloat end); + void Run(solver_t& solver, deviceMemory& o_q, dfloat start, dfloat end); }; /* Multi-rate Adams-Bashforth, order 3 */ -class mrab3: public timeStepper_t { +class mrab3: public timeStepperBase_t { protected: - mesh_t &mesh; + mesh_t mesh; int Nstages; int Nlevels; int Nfields; - int* shiftIndex; - occa::memory o_shiftIndex, h_shiftIndex; + deviceMemory o_shiftIndex; + deviceMemory h_shiftIndex; - dfloat *mrdt; - occa::memory o_mrdt; + memory mrdt; + deviceMemory o_mrdt; - dfloat *ab_a, *ab_b; - occa::memory o_ab_a, o_ab_b; + memory ab_a, ab_b; + deviceMemory o_ab_a, o_ab_b; - occa::memory o_rhsq0, o_rhsq, o_fQM; + deviceMemory o_rhsq0, o_rhsq, o_fQM; - occa::kernel updateKernel; - occa::kernel traceUpdateKernel; + kernel_t updateKernel; + kernel_t traceUpdateKernel; - virtual void Step(occa::memory& o_q, dfloat time, dfloat dt, int order); + virtual void Step(solver_t& solver, deviceMemory& o_q, dfloat time, dfloat dt, int order); public: mrab3(dlong _Nelements, dlong _NhaloElements, int _Np, int _Nfields, - solver_t& _solver, mesh_t& _mesh); - ~mrab3(); + platform_t& _platform, mesh_t& _mesh); - void Run(occa::memory& o_q, dfloat start, dfloat end); + void Run(solver_t& solver, deviceMemory& o_q, dfloat start, dfloat end); }; /* 
Multi-rate Semi-Analytic Adams-Bashforth, order 3 */ -class mrsaab3: public timeStepper_t { +class mrsaab3: public timeStepperBase_t { protected: - mesh_t &mesh; + mesh_t mesh; int Nstages; int Nlevels; int Nfields; - dfloat *lambda; + memory lambda; - int* shiftIndex; - occa::memory o_shiftIndex, h_shiftIndex; + deviceMemory o_shiftIndex; + pinnedMemory h_shiftIndex; - dfloat *mrdt; - occa::memory o_mrdt; + memory mrdt; + deviceMemory o_mrdt; - dfloat *saab_x, *saab_a, *saab_b; - occa::memory o_saab_x, o_saab_a, o_saab_b; + memory saab_x, saab_a, saab_b; + deviceMemory o_saab_x, o_saab_a, o_saab_b; - occa::memory o_rhsq0, o_rhsq, o_fQM; + deviceMemory o_rhsq0, o_rhsq, o_fQM; - occa::kernel updateKernel; - occa::kernel traceUpdateKernel; + kernel_t updateKernel; + kernel_t traceUpdateKernel; - virtual void Step(occa::memory& o_q, dfloat time, dfloat dt, int order); + virtual void Step(solver_t& solver, deviceMemory& o_q, dfloat time, dfloat dt, int order); void UpdateCoefficients(); public: mrsaab3(dlong _Nelements, dlong _NhaloElements, int _Np, int _Nfields, - dfloat *_lambda, - solver_t& _solver, mesh_t& _mesh); - ~mrsaab3(); + memory _lambda, + platform_t& _platform, mesh_t& _mesh); void Init(); - void Run(occa::memory& o_q, dfloat start, dfloat end); + void Run(solver_t& solver, deviceMemory& o_q, dfloat start, dfloat end); }; @@ -466,15 +497,15 @@ class ab3_pml: public ab3 { private: dlong Npml; - occa::memory o_pmlq; - occa::memory o_rhspmlq; + deviceMemory o_pmlq; + deviceMemory o_rhspmlq; - void Step(occa::memory& o_q, dfloat time, dfloat dt, int order); + void Step(solver_t& solver, deviceMemory& o_q, dfloat time, dfloat dt, int order); public: ab3_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements, - int Np, int Nfields, int Npmlfields, solver_t& solver); - ~ab3_pml(); + int Np, int Nfields, int Npmlfields, + platform_t& _platform, comm_t _comm); }; /* Low-Storage Explicit Runge-Kutta, order 4 */ @@ -482,16 +513,16 @@ class lserk4_pml: public 
lserk4 { private: dlong Npml; - occa::memory o_pmlq; - occa::memory o_rhspmlq; - occa::memory o_respmlq; + deviceMemory o_pmlq; + deviceMemory o_rhspmlq; + deviceMemory o_respmlq; - void Step(occa::memory& o_q, dfloat time, dfloat dt); + void Step(solver_t& solver, deviceMemory& o_q, dfloat time, dfloat dt); public: lserk4_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements, - int Np, int Nfields, int Npmlfields, solver_t& solver); - ~lserk4_pml(); + int Np, int Nfields, int Npmlfields, + platform_t& _platform, comm_t _comm); }; /* Dormand-Prince method */ @@ -500,25 +531,25 @@ class dopri5_pml: public dopri5 { private: dlong Npml; - occa::memory o_pmlq; - occa::memory o_rhspmlq; - occa::memory o_rkpmlq; - occa::memory o_rkrhspmlq; + deviceMemory o_pmlq; + deviceMemory o_rhspmlq; + deviceMemory o_rkpmlq; + deviceMemory o_rkrhspmlq; - occa::memory o_savepmlq; + deviceMemory o_savepmlq; - occa::kernel rkPmlUpdateKernel; + kernel_t rkPmlUpdateKernel; - void Backup(occa::memory &o_Q); - void Restore(occa::memory &o_Q); - void AcceptStep(occa::memory &o_q, occa::memory &o_rq); + void Backup(deviceMemory &o_Q); + void Restore(deviceMemory &o_Q); + void AcceptStep(deviceMemory &o_q, deviceMemory &o_rq); - void Step(occa::memory& o_q, dfloat time, dfloat dt); + void Step(solver_t& solver, deviceMemory& o_q, dfloat time, dfloat dt); public: dopri5_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements, - int Np, int Nfields, int Npmlfields, solver_t& solver, MPI_Comm _comm); - ~dopri5_pml(); + int Np, int Nfields, int Npmlfields, + platform_t& _platform, comm_t _comm); }; /* Semi-Analytic Adams-Bashforth, order 3 */ @@ -527,21 +558,21 @@ class saab3_pml: public saab3 { private: dlong Npml; - dfloat *pmlsaab_x, *pmlsaab_a; - occa::memory o_pmlsaab_x, o_pmlsaab_a; + memory pmlsaab_x, pmlsaab_a; + deviceMemory o_pmlsaab_x, o_pmlsaab_a; - occa::memory o_pmlq; - occa::memory o_rhspmlq; + deviceMemory o_pmlq; + deviceMemory o_rhspmlq; - occa::kernel 
pmlUpdateKernel; + kernel_t pmlUpdateKernel; - void Step(occa::memory& o_q, dfloat time, dfloat dt, int order); + void Step(solver_t& solver, deviceMemory& o_q, dfloat time, dfloat dt, int order); public: saab3_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements, int Np, int Nfields, int _Npmlfields, - dfloat *_lambda, solver_t& solver); - ~saab3_pml(); + memory _lambda, + platform_t& _platform, comm_t _comm); }; /* Semi-Analytic Explict Runge-Kutta, order 4 with embedded order 3 and adaptive time-stepping */ @@ -550,30 +581,29 @@ class sark4_pml: public sark4 { private: dlong Npml; - dfloat *pmlrkA; - occa::memory o_pmlrkA; + memory pmlrkA; + deviceMemory o_pmlrkA; - occa::memory o_pmlq; - occa::memory o_rhspmlq; - occa::memory o_rkpmlq; - occa::memory o_rkrhspmlq; + deviceMemory o_pmlq; + deviceMemory o_rhspmlq; + deviceMemory o_rkpmlq; + deviceMemory o_rkrhspmlq; - occa::memory o_savepmlq; + deviceMemory o_savepmlq; - occa::kernel rkPmlUpdateKernel; - occa::kernel rkPmlStageKernel; + kernel_t rkPmlUpdateKernel; + kernel_t rkPmlStageKernel; - void Backup(occa::memory &o_Q); - void Restore(occa::memory &o_Q); - void AcceptStep(occa::memory &o_q, occa::memory &o_rq); + void Backup(deviceMemory &o_Q); + void Restore(deviceMemory &o_Q); + void AcceptStep(deviceMemory &o_q, deviceMemory &o_rq); - void Step(occa::memory& o_q, dfloat time, dfloat dt); + void Step(solver_t& solver, deviceMemory& o_q, dfloat time, dfloat dt); public: sark4_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements, int Np, int Nfields, int _Npmlfields, - dfloat *_lambda, solver_t& solver, MPI_Comm _comm); - ~sark4_pml(); + memory _lambda, platform_t& _platform, comm_t _comm); }; /* Semi-Analytic Explict Runge-Kutta, order 5 with embedded order 4 and adaptive time-stepping */ @@ -582,30 +612,29 @@ class sark5_pml: public sark5 { private: dlong Npml; - dfloat *pmlrkA; - occa::memory o_pmlrkA; + memory pmlrkA; + deviceMemory o_pmlrkA; - occa::memory o_pmlq; - occa::memory 
o_rhspmlq; - occa::memory o_rkpmlq; - occa::memory o_rkrhspmlq; + deviceMemory o_pmlq; + deviceMemory o_rhspmlq; + deviceMemory o_rkpmlq; + deviceMemory o_rkrhspmlq; - occa::memory o_savepmlq; + deviceMemory o_savepmlq; - occa::kernel rkPmlUpdateKernel; - occa::kernel rkPmlStageKernel; + kernel_t rkPmlUpdateKernel; + kernel_t rkPmlStageKernel; - void Backup(occa::memory &o_Q); - void Restore(occa::memory &o_Q); - void AcceptStep(occa::memory &o_q, occa::memory &o_rq); + void Backup(deviceMemory &o_Q); + void Restore(deviceMemory &o_Q); + void AcceptStep(deviceMemory &o_q, deviceMemory &o_rq); - void Step(occa::memory& o_q, dfloat time, dfloat dt); + void Step(solver_t& solver, deviceMemory& o_q, dfloat time, dfloat dt); public: sark5_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements, int Np, int Nfields, int _Npmlfields, - dfloat *_lambda, solver_t& solver, MPI_Comm _comm); - ~sark5_pml(); + memory _lambda, platform_t& _platform, comm_t _comm); }; @@ -615,17 +644,16 @@ class mrab3_pml: public mrab3 { dlong Npml; int Npmlfields; - occa::memory o_pmlq; - occa::memory o_rhspmlq0, o_rhspmlq; + deviceMemory o_pmlq; + deviceMemory o_rhspmlq0, o_rhspmlq; - occa::kernel pmlUpdateKernel; + kernel_t pmlUpdateKernel; - void Step(occa::memory& o_q, dfloat time, dfloat dt, int order); + void Step(solver_t& solver, deviceMemory& o_q, dfloat time, dfloat dt, int order); public: mrab3_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements, - int Np, int Nfields, int _Npmlfields, solver_t& solver, mesh_t& _mesh); - ~mrab3_pml(); + int Np, int Nfields, int _Npmlfields, platform_t& _platform, mesh_t& _mesh); }; /* Multi-rate Semi-Analytic Adams-Bashforth, order 3 */ @@ -635,23 +663,25 @@ class mrsaab3_pml: public mrsaab3 { dlong Npml; int Npmlfields; - occa::memory o_pmlq; - dfloat *pmlsaab_a, *pmlsaab_b; - occa::memory o_pmlsaab_a, o_pmlsaab_b; + deviceMemory o_pmlq; - occa::memory o_rhspmlq0, o_rhspmlq; + memory pmlsaab_a, pmlsaab_b; + deviceMemory o_pmlsaab_a, 
o_pmlsaab_b; - occa::kernel pmlUpdateKernel; + deviceMemory o_rhspmlq0, o_rhspmlq; - void Step(occa::memory& o_q, dfloat time, dfloat dt, int order); + kernel_t pmlUpdateKernel; + + void Step(solver_t& solver, deviceMemory& o_q, dfloat time, dfloat dt, int order); public: mrsaab3_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements, int Np, int Nfields, int _Npmlfields, - dfloat *_lambda, solver_t& solver, mesh_t& _mesh); - ~mrsaab3_pml(); + memory _lambda, platform_t& _platform, mesh_t& _mesh); }; } //namespace TimeStepper +} //namespace libp + #endif diff --git a/include/timer.hpp b/include/timer.hpp new file mode 100644 index 000000000..cf5a22c2a --- /dev/null +++ b/include/timer.hpp @@ -0,0 +1,59 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +#ifndef LIBP_TIMER_HPP +#define LIBP_TIMER_HPP + +#include "core.hpp" +#include "comm.hpp" +#include "platform.hpp" +#include + +namespace libp { + +using timePoint_t = std::chrono::time_point; + +/* Host time*/ +timePoint_t Time(); + +/* Host time after global sync*/ +timePoint_t GlobalTime(comm_t comm); + +/* Host time after platform sync*/ +timePoint_t PlatformTime(platform_t &platform); + +/* Host time after platform sync*/ +timePoint_t GlobalPlatformTime(platform_t &platform); + +/* Host time after platform sync*/ +timePoint_t GlobalPlatformTime(platform_t &platform, comm_t comm); + +/*Time between time points, in seconds*/ +double ElapsedTime(const timePoint_t start, const timePoint_t end); + +} //namespace libp + +#endif diff --git a/include/types.h b/include/types.h index 45f3bd372..b801ea38a 100644 --- a/include/types.h +++ b/include/types.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -30,24 +30,18 @@ SOFTWARE. // precision of AMG storage #if 0 #define pfloat float -#define ogs_pfloat ogs_float #else #define pfloat double -#define ogs_pfloat ogs_double #endif //float data type #if 0 #define dfloat float -#define ogs_dfloat ogs_float -#define MPI_DFLOAT MPI_FLOAT #define dfloatFormat "%f" #define dfloatString "float" #else #define dfloat double -#define ogs_dfloat ogs_double -#define MPI_DFLOAT MPI_DOUBLE #define dfloatFormat "%lf" #define dfloatString "double" #endif @@ -55,14 +49,10 @@ SOFTWARE. 
//host index data type #if 0 #define hlong int -#define ogs_hlong ogs_int -#define MPI_HLONG MPI_INT #define hlongFormat "%d" #define hlongString "int" #else #define hlong long long int -#define ogs_hlong ogs_long_long -#define MPI_HLONG MPI_LONG_LONG_INT #define hlongFormat "%lld" #define hlongString "long long int" #endif @@ -70,14 +60,10 @@ SOFTWARE. //device index data type #if 1 #define dlong int -#define ogs_dlong ogs_int -#define MPI_DLONG MPI_INT #define dlongFormat "%d" #define dlongString "int" #else #define dlong long long int -#define ogs_dlong ogs_longlongint -#define MPI_DLONG MPI_LONG_LONG_INT #define dlongFormat "%lld" #define dlongString "long long int" #endif diff --git a/include/utils.hpp b/include/utils.hpp index 842066bbb..8d0ec5634 100644 --- a/include/utils.hpp +++ b/include/utils.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,11 +27,25 @@ SOFTWARE. #ifndef UTILS_HPP #define UTILS_HPP -#include -#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "types.h" +namespace libp { + +using properties_t = occa::json; +using device_t = occa::device; +using kernel_t = occa::kernel; +using stream_t = occa::stream; + //error codes #define LIBP_SUCCESS 0 #define LIBP_ERROR -1 @@ -40,37 +54,62 @@ SOFTWARE. 
# define __PRETTY_FUNCTION__ __FUNCTION__ #endif -#define LIBP_ABORT2(filename, function, line, message) \ - { \ - std::string banner = "---[ Error ]"; \ - std::cerr << '\n' \ - << std::string(74, '=') << '\n' \ - << banner << std::string(74 - banner.size(), '-') << '\n' \ - << " File : " << filename << '\n' \ - << " Line : " << line << '\n' \ - << " Function : " << function << '\n' \ - << " Message : " << message << '\n' \ - << std::string(74, '=') << '\n'; \ - MPI_Abort(MPI_COMM_WORLD,LIBP_ERROR); \ - } -#define LIBP_ABORT(message) LIBP_ABORT2(__FILE__, __PRETTY_FUNCTION__, __LINE__, message) - -#define LIBP_WARNING(message) \ - { \ - std::string banner = "---[ Warning ]"; \ - std::cerr << '\n' \ - << std::string(74, '=') << '\n' \ - << banner << std::string(74 - banner.size(), '-') << '\n' \ - << " " << message << '\n' \ - << std::string(74, '=') << '\n'; \ - } - -#define mymax(a,b) (((a)>(b))?(a):(b)) -#define mymin(a,b) (((a)<(b))?(a):(b)) - -// block size for reduction (hard coded) -#define BLOCKSIZE 256 - - +#define LIBP_TEMPLATE_CHECK(checkFunction, expr, filename, function, line, message) \ + do { \ + const bool isErr = (bool) (expr); \ + if (isErr) { \ + std::stringstream _check_ss; \ + _check_ss << message; \ + checkFunction(filename, function, line, _check_ss.str()); \ + } \ + } while (false) + +#define LIBP_ABORT3(expr, filename, function, line, message) LIBP_TEMPLATE_CHECK(libp::abort, expr, filename, function, line, message) +#define LIBP_ABORT2(expr, filename, function, line, message) LIBP_ABORT3(expr, filename, function, line, message) +#define LIBP_ABORT(message, expr) LIBP_ABORT2(expr, __FILE__, __PRETTY_FUNCTION__, __LINE__, message) + +#define LIBP_WARNING3(expr, filename, function, line, message) LIBP_TEMPLATE_CHECK(libp::warn, expr, filename, function, line, message) +#define LIBP_WARNING2(expr, filename, function, line, message) LIBP_WARNING3(expr, filename, function, line, message) +#define LIBP_WARNING(message, expr) LIBP_WARNING2(expr, 
__FILE__, __PRETTY_FUNCTION__, __LINE__, message) + +#define LIBP_FORCE_ABORT(message) LIBP_ABORT(message, true) +#define LIBP_FORCE_WARNING(message) LIBP_WARNING(message, true) + +class exception : public std::exception { + public: + const std::string header; + const std::string filename; + const std::string function; + const std::string message; + const int line; + + std::string exceptionMessage; + + exception(const std::string &header_, + const std::string &filename_, + const std::string &function_, + const int line_, + const std::string &message_); + ~exception() throw(); + + const char* what() const throw(); + std::string toString() const; + std::string location() const; +}; + +std::ostream& operator << (std::ostream& out, + const exception &exc); + +void abort(const std::string &filename, + const std::string &function, + const int line, + const std::string &message); + +void warn(const std::string &filename, + const std::string &function, + const int line, + const std::string &message); + +} //namespace libp #endif diff --git a/libs/core/comm.cpp b/libs/core/comm.cpp new file mode 100644 index 000000000..f96af932e --- /dev/null +++ b/libs/core/comm.cpp @@ -0,0 +1,118 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "comm.hpp" + +namespace libp { + +namespace Comm { + +/*Static MPI_Init and MPI_Finalize*/ +void Init(int &argc, char** &argv) { MPI_Init(&argc, &argv); } +void Finalize() { MPI_Finalize(); } + +/*Static handle to MPI_COMM_WORLD*/ +comm_t World() { + comm_t c; + c.comm_ptr = std::make_shared(); + *(c.comm_ptr) = MPI_COMM_WORLD; + MPI_Comm_rank(c.comm(), &(c._rank)); + MPI_Comm_size(c.comm(), &(c._size)); + return c; +} + +void GetProcessorName(char* name, int &namelen) { + MPI_Get_processor_name(name,&namelen); +} + +} //namespace Comm + +/*MPI_Comm_dup and free*/ +comm_t comm_t::Dup() const { + comm_t c; + /*Make a new comm shared_ptr, which will call MPI_Comm_free when destroyed*/ + c.comm_ptr = std::shared_ptr(new MPI_Comm, + [](MPI_Comm *comm) { + if (*comm != MPI_COMM_NULL) + MPI_Comm_free(comm); + delete comm; + }); + MPI_Comm_dup(comm(), c.comm_ptr.get()); + MPI_Comm_rank(c.comm(), &(c._rank)); + MPI_Comm_size(c.comm(), &(c._size)); + return c; +} +void comm_t::Free() { + comm_ptr = nullptr; + _rank=0; + _size=0; +} +/*Split*/ +comm_t comm_t::Split(const int color, const int key) const { + comm_t c; + /*Make a new comm shared_ptr, which will call MPI_Comm_free when destroyed*/ + c.comm_ptr = std::shared_ptr(new MPI_Comm, + [](MPI_Comm *comm) { + if (*comm != MPI_COMM_NULL) + MPI_Comm_free(comm); + delete comm; + }); + + MPI_Comm_split(comm(), color, key, c.comm_ptr.get()); + MPI_Comm_rank(c.comm(), &(c._rank)); + MPI_Comm_size(c.comm(), &(c._size)); + return c; +} + +/*Rank and 
size getters*/ +const int comm_t::rank() const { + return _rank; +} +const int comm_t::size() const { + return _size; +} + +MPI_Comm comm_t::comm() const { + if (comm_ptr == nullptr) { + return MPI_COMM_NULL; + } else { + return *comm_ptr; + } +} + +void comm_t::Wait(Comm::request_t &request) const { + MPI_Wait(&request, MPI_STATUS_IGNORE); +} + +void comm_t::Waitall(const int count, memory &requests) const { + MPI_Waitall(count, requests.ptr(), MPI_STATUSES_IGNORE); +} + +void comm_t::Barrier() const { + MPI_Barrier(comm()); +} + +} //namespace libp diff --git a/libs/core/exception.cpp b/libs/core/exception.cpp new file mode 100644 index 000000000..2be2c7f12 --- /dev/null +++ b/libs/core/exception.cpp @@ -0,0 +1,90 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +#include "utils.hpp" + +namespace libp { + + +exception::exception(const std::string &header_, + const std::string &filename_, + const std::string &function_, + const int line_, + const std::string &message_) : + header(header_), + filename(filename_), + function(function_), + message(message_), + line(line_), + exceptionMessage(toString()) {} + +exception::~exception() throw() {} + +const char* exception::what() const throw() { + return exceptionMessage.c_str(); +} + +std::string exception::toString() const { + std::stringstream ss; + std::string banner = "---[ " + header + " ]"; + ss << '\n' + << banner << std::string(80 - banner.size(), '-') << '\n' + << location() + << " Message : " << message << '\n' + << std::string(80, '=') << '\n'; + return ss.str(); +} + +std::string exception::location() const { + std::stringstream ss; + ss << " File : " << filename << '\n' + << " Line : " << line << '\n' + << " Function : " << function << '\n'; + return ss.str(); +} + +std::ostream& operator << (std::ostream& out, + const exception &exc) { + out << exc.toString() << std::flush; + return out; +} + +void abort(const std::string &filename, + const std::string &function, + const int line, + const std::string &message) { + throw exception("Error", filename, function, line, message); +} + +void warn(const std::string &filename, + const std::string &function, + const int line, + const std::string &message) { + exception exp("Warning", filename, function, line, message); + std::cout << exp; +} + +} //namespace libp diff --git a/libs/core/factor.cpp b/libs/core/factor.cpp deleted file mode 100644 index d61926f61..000000000 --- a/libs/core/factor.cpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without 
restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -#include "core.hpp" - -// find a factorization n = nx*ny such that -// nx>=ny are 'close' to one another -void factor2(const int n, int &nx, int &ny) { - //start with guessing nx ~= n^1/2 - nx = round(sqrt(n)); - ny = 1; - - for (;nxnx) std::swap(nx,ny); - - return; - } - } - - //if we made it this far, n is prime - nx = n; -} - -// find a factorization n = nx*ny*nz such that -// nx>=ny>=nz are all 'close' to one another -void factor3(const int n, int &nx, int &ny, int &nz) { - //start with guessing nx ~= n^1/3 - nx = round(std::cbrt(n)); - ny = nz = 1; - - for (;nxnx) std::swap(nx,ny); - if (nz>ny) std::swap(ny,nz); - if (ny>nx) std::swap(nx,ny); - - return; - } - } - - //if we're here, f is prime - ny = f; - nz = 1; - - //swap if needed - if (ny>nx) std::swap(nx,ny); - - return; - } - } - - //if we made it this far, n is prime - nx = n; -} \ No newline at end of file diff --git a/libs/core/matrixEig.cpp b/libs/core/matrixEig.cpp deleted file mode 100644 index 075c5735b..000000000 --- a/libs/core/matrixEig.cpp +++ /dev/null @@ -1,178 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2020 Tim Warburton, Noel 
Chalmers, Jesse Chan, Ali Karakus - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- -*/ - -#include "core.hpp" - -extern "C" { - void sgeev_(char *JOBVL, char *JOBVR, int *N, float *A, int *LDA, float *WR, float *WI, - float *VL, int *LDVL, float *VR, int *LDVR, float *WORK, int *LWORK, int *INFO ); - void dgeev_(char *JOBVL, char *JOBVR, int *N, double *A, int *LDA, double *WR, double *WI, - double *VL, int *LDVL, double *VR, int *LDVR, double *WORK, int *LWORK, int *INFO ); -} - -// compute right eigenvectors -void matrixEigenVectors(int N, double *A, double *VR, double *WR, double *WI){ - - char JOBVL = 'N'; - char JOBVR = 'V'; - int LDA = N; - int LDVL = N; - int LDVR = N; - int LWORK = 8*N; - - double *VL = NULL; - double *WORK = (double*) calloc(LWORK,sizeof(double)); - - double *tmpA = (double*) calloc(N*N,sizeof(double)); - double *tmpVR = (double*) calloc(N*N,sizeof(double)); - - for(int n=0;nmesh2D::OccaSetup(); - - o_D = platform.malloc(Nq*Nq*sizeof(dfloat), D); - - o_S = o_D; //dummy - o_MM = o_D; //dummy - o_sM = o_D; //dummy - o_LIFT = o_D; //dummy - - o_vgeo = platform.malloc((Nelements+totalHaloPairs)*Nvgeo*Np*sizeof(dfloat), vgeo); - o_sgeo = platform.malloc(Nelements*Nfaces*Nfp*Nsgeo*sizeof(dfloat), sgeo); - o_ggeo = platform.malloc(Nelements*Np*Nggeo*sizeof(dfloat), ggeo); -} +#include "memory.hpp" + +namespace libp { +/*explicit instantiation of common specializations*/ +template class memory; +template class memory; +template class memory; +template class memory; + +/*explicit instantiation of common specializations*/ +template class deviceMemory; +template class deviceMemory; +template class deviceMemory; +template class deviceMemory; +} //namespace libp diff --git a/libs/core/parallelSort.cpp b/libs/core/parallelSort.cpp deleted file mode 100644 index 76e3b97a9..000000000 --- a/libs/core/parallelSort.cpp +++ /dev/null @@ -1,145 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - -Permission is hereby granted, free of charge, to any person obtaining a copy -of 
this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -/* use this for int */ -#include "mesh.hpp" - -void mergeLists(size_t sz, - int N1, char *v1, - int N2, char *v2, - char *v3, - int (*compare)(const void *, const void *), - void (*match)(void *, void *)){ - - int n1 = 0, n2 = 0, n3 = 0; - - // merge two lists from v1 and v2 - for(n3=0;n30) - MPI_Irecv(A, NA*sz, MPI_CHAR, rank-1, tag, comm, &recvA); - - if(rank0) - MPI_Wait(&recvA, &status); - - /* merge sort A & B */ - if(rank>0) - mergeLists(sz, NA, (char*)A, NB, (char*)B, (char*)tmp, compare, match); - - /* send A, receive C */ - if(rank>0) - MPI_Isend(A, NA*sz, MPI_CHAR, rank-1, tag, comm, &sendA); - if(rank0) - MPI_Wait(&sendA, &status); - if(rank hostnames(size()*MAX_PROCESSOR_NAME); + memory hostname = hostnames + rank()*MAX_PROCESSOR_NAME; + + int namelen; + Comm::GetProcessorName(hostname.ptr(), namelen); + comm.Allgather(hostnames, MAX_PROCESSOR_NAME); + + int localRank = 0; + int localSize = 0; + for (int n=0; n0 && localRank>=deviceCount) { - stringstream ss; - ss << "Rank " << rank << " oversubscribing 
device " << device_id%deviceCount << " on node \"" << hostname<< "\""; - LIBP_WARNING(ss.str()); + LIBP_FORCE_WARNING("Rank " << rank() << " oversubscribing device " << device_id%deviceCount << " on node \"" << hostname.ptr() << "\""); device_id = device_id%deviceCount; } - MPI_Barrier(MPI_COMM_WORLD); - free(hostnames); } // add device_id to setup string @@ -101,18 +100,82 @@ void platform_t::DeviceConfig(){ mode += ", device_id: " + std::to_string(device_id) + "}"; } - //set number of omp threads to use - //int Ncores = sysconf(_SC_NPROCESSORS_ONLN); - //int Nthreads = Ncores/localSize; - // Nthreads = mymax(1,Nthreads/2); - // omp_set_num_threads(Nthreads); +#if !defined(LIBP_DEBUG) + /*set number of omp threads to use*/ + /*Use lscpu to determine core and socket counts */ + FILE *pipeCores = popen("lscpu | grep \"Core(s) per socket\" | awk '{print $4}'", "r"); + FILE *pipeSockets = popen("lscpu | grep \"Socket(s)\" | awk '{print $2}'", "r"); + LIBP_ABORT("popen() failed!", + !pipeCores || !pipeSockets); + + std::array buffer; + //read to end of line + LIBP_ABORT("Error reading core count", + !fgets(buffer.data(), buffer.size(), pipeCores)); + int Ncores = std::stoi(buffer.data()); + + //read to end of line + LIBP_ABORT("Error reading core count", + !fgets(buffer.data(), buffer.size(), pipeSockets)); + int Nsockets = std::stoi(buffer.data()); + + pclose(pipeCores); + pclose(pipeSockets); + + // int Ncores = omp_get_num_procs(); + int NcoresPerNode = Ncores*Nsockets; + int Nthreads=0; + + /*Check OMP_NUM_THREADS env variable*/ + std::string ompNumThreads; + char * ompEnvVar = std::getenv("OMP_NUM_THREADS"); + if (ompEnvVar == nullptr) { // Environment variable is not set + Nthreads = std::max(NcoresPerNode/localSize, 1); //Evenly divide number of cores + + // If omp max threads is lower than this (due to binding), go with omp + Nthreads = std::min(Nthreads, omp_get_max_threads()); + } else { + ompNumThreads = ompEnvVar; + // Environment variable is set, but could
be empty string + if (ompNumThreads.size() == 0) { + // Environment variable is set but equal to empty string + Nthreads = std::max(NcoresPerNode/localSize, 1); //Evenly divide number of cores; + + // If omp max threads is lower than this (due to binding), go with omp + Nthreads = std::min(Nthreads, omp_get_max_threads()); + } else { + Nthreads = std::stoi(ompNumThreads); + } + } + LIBP_WARNING("Rank " << rank() << " oversubscribing CPU on node \"" << hostname.ptr() << "\"", + Nthreads*localSize>NcoresPerNode); + omp_set_num_threads(Nthreads); + // omp_set_num_threads(1); - // if (settings.compareSetting("VERBOSE","TRUE")) - // printf("Rank %d: Ncores = %d, Nthreads = %d, device_id = %d \n", rank, Ncores, Nthreads, device_id); + // printf("Rank %d: Nsockets = %d, NcoresPerSocket = %d, Nthreads = %d, device_id = %d \n", + // rank(), Nsockets, Ncores, Nthreads, device_id); +#endif device.setup(mode); - std::string occaCacheDir = LIBP_DIR "/.occa"; - settings.getSetting("CACHE DIR", occaCacheDir); - occa::env::setOccaCacheDir(occaCacheDir); + std::string cacheDir; + char * cacheEnvVar = std::getenv("LIBP_CACHE_DIR"); + if (cacheEnvVar == nullptr) { + // Environment variable is not set + cacheDir = LIBP_DIR "/.occa"; + } + else { + // Environment variable is set, but could be empty string + cacheDir = cacheEnvVar; + + if (cacheDir.size() == 0) { + // Environment variable is set but equal to empty string + cacheDir = LIBP_DIR "/.occa"; + } + } + setCacheDir(cacheDir); + + comm.Barrier(); } + +} //namespace libp diff --git a/libs/core/platformProperties.cpp b/libs/core/platformProperties.cpp index 9f603bf9d..451586afa 100644 --- a/libs/core/platformProperties.cpp +++ b/libs/core/platformProperties.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this
software and associated documentation files (the "Software"), to deal @@ -26,79 +26,89 @@ SOFTWARE. #include "platform.hpp" +namespace libp { + //initialize occa::properties with common props void platform_t::DeviceProperties(){ - props["defines"].asObject(); - props["includes"].asArray(); - props["header"].asArray(); - props["flags"].asObject(); + properties_t& Props = props(); + + Props["defines"].asObject(); + Props["includes"].asArray(); + Props["header"].asArray(); + Props["flags"].asObject(); - props["device"].asObject(); - props["kernel"].asObject(); - props["memory"].asObject(); + Props["device"].asObject(); + Props["kernel"].asObject(); + Props["memory"].asObject(); if(sizeof(dfloat)==4){ - props["defines/" "dfloat"]="float"; - props["defines/" "dfloat2"]="float2"; - props["defines/" "dfloat4"]="float4"; - props["defines/" "dfloat8"]="float8"; + Props["defines/" "dfloat"]="float"; + Props["defines/" "dfloat2"]="float2"; + Props["defines/" "dfloat4"]="float4"; + Props["defines/" "dfloat8"]="float8"; } if(sizeof(dfloat)==8){ - props["defines/" "dfloat"]="double"; - props["defines/" "dfloat2"]="double2"; - props["defines/" "dfloat4"]="double4"; - props["defines/" "dfloat8"]="double8"; + Props["defines/" "dfloat"]="double"; + Props["defines/" "dfloat2"]="double2"; + Props["defines/" "dfloat4"]="double4"; + Props["defines/" "dfloat8"]="double8"; } if(sizeof(pfloat)==4){ - props["defines/" "pfloat"]="float"; - props["defines/" "pfloat2"]="float2"; - props["defines/" "pfloat4"]="float4"; - props["defines/" "pfloat8"]="float8"; + Props["defines/" "pfloat"]="float"; + Props["defines/" "pfloat2"]="float2"; + Props["defines/" "pfloat4"]="float4"; + Props["defines/" "pfloat8"]="float8"; } if(sizeof(pfloat)==8){ - props["defines/" "pfloat"]="double"; - props["defines/" "pfloat2"]="double2"; - props["defines/" "pfloat4"]="double4"; - props["defines/" "pfloat8"]="double8"; + Props["defines/" "pfloat"]="double"; + Props["defines/" "pfloat2"]="double2"; + Props["defines/" 
"pfloat4"]="double4"; + Props["defines/" "pfloat8"]="double8"; } if(sizeof(dlong)==4){ - props["defines/" "dlong"]="int"; + Props["defines/" "dlong"]="int"; } if(sizeof(dlong)==8){ - props["defines/" "dlong"]="long long int"; + Props["defines/" "dlong"]="long long int"; } if(device.mode()=="Serial") { - props["compiler_flags"] += "-O3 "; - props["compiler_flags"] += "-g "; //debugging + Props["compiler_flags"] += "-O3 "; + Props["compiler_flags"] += "-g "; //debugging + Props["defines/OCCA_USE_SERIAL"] = 1; } if(device.mode()=="CUDA"){ // add backend compiler optimization for CUDA - props["compiler_flags"] += "--ftz=true "; - props["compiler_flags"] += "--prec-div=false "; - props["compiler_flags"] += "--prec-sqrt=false "; - props["compiler_flags"] += "--use_fast_math "; - props["compiler_flags"] += "--fmad=true "; // compiler option for cuda - props["compiler_flags"] += "-Xptxas -dlcm=ca"; + Props["compiler_flags"] += "--ftz=true "; + Props["compiler_flags"] += "--prec-div=false "; + Props["compiler_flags"] += "--prec-sqrt=false "; + Props["compiler_flags"] += "--use_fast_math "; + Props["compiler_flags"] += "--fmad=true "; // compiler option for cuda + Props["compiler_flags"] += "-Xptxas -dlcm=ca"; + Props["defines/OCCA_USE_CUDA"] = 1; } if(device.mode()=="OpenCL"){ // add backend compiler optimization for OPENCL - props["compiler_flags"] += " -cl-std=CL2.0 "; - props["compiler_flags"] += " -cl-strict-aliasing "; - props["compiler_flags"] += " -cl-mad-enable "; - props["compiler_flags"] += " -cl-no-signed-zeros "; - props["compiler_flags"] += " -cl-unsafe-math-optimizations "; - props["compiler_flags"] += " -cl-fast-relaxed-math "; + Props["compiler_flags"] += " -cl-std=CL2.0 "; + Props["compiler_flags"] += " -cl-strict-aliasing "; + Props["compiler_flags"] += " -cl-mad-enable "; + Props["compiler_flags"] += " -cl-no-signed-zeros "; + Props["compiler_flags"] += " -cl-unsafe-math-optimizations "; + Props["compiler_flags"] += " -cl-fast-relaxed-math "; + 
Props["defines/OCCA_USE_OPENCL"] = 1; } if(device.mode()=="HIP"){ // add backend compiler optimization for HIP - props["compiler_flags"] += " -O3 "; - props["compiler_flags"] += " -ffp-contract=fast "; - // props["compiler_flags"] += " -funsafe-math-optimizations "; - // props["compiler_flags"] += " -ffast-math "; + Props["compiler_flags"] += " -O3 "; + Props["compiler_flags"] += " -ffp-contract=fast "; + Props["compiler_flags"] += " -funsafe-math-optimizations "; + Props["compiler_flags"] += " -ffast-math "; + Props["defines/OCCA_USE_HIP"] = 1; } } + +} //namespace libp diff --git a/libs/core/platformSettings.cpp b/libs/core/platformSettings.cpp index ae59f180b..ba506ed61 100644 --- a/libs/core/platformSettings.cpp +++ b/libs/core/platformSettings.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,7 +26,9 @@ SOFTWARE. 
#include "platform.hpp" -platformSettings_t::platformSettings_t(MPI_Comm _comm): +namespace libp { + +platformSettings_t::platformSettings_t(comm_t _comm): settings_t(_comm) { //settings format @@ -55,10 +57,7 @@ platformSettings_t::platformSettings_t(MPI_Comm _comm): void platformSettings_t::report() { - int rank; - MPI_Comm_rank(comm, &rank); - - if (rank==0) { + if (comm.rank()==0) { std::cout << "OCCA Settings:\n\n"; reportSetting("THREAD MODEL"); @@ -66,12 +65,12 @@ void platformSettings_t::report() { if (compareSetting("THREAD MODEL","OpenCL")) reportSetting("PLATFORM NUMBER"); - int size; - MPI_Comm_size(comm, &size); - if ((size==1) + if ((comm.size()==1) &&(compareSetting("THREAD MODEL","CUDA") ||compareSetting("THREAD MODEL","HIP") ||compareSetting("THREAD MODEL","OpenCL") )) reportSetting("DEVICE NUMBER"); } -} \ No newline at end of file +} + +} //namespace libp diff --git a/libs/core/rankDecomp.cpp b/libs/core/rankDecomp.cpp new file mode 100644 index 000000000..6353abd21 --- /dev/null +++ b/libs/core/rankDecomp.cpp @@ -0,0 +1,215 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "core.hpp" + +namespace libp { + +// find a factorization n = nx*ny such that +// nx>=ny are 'close' to one another +void Factor2(const int n, int &nx, int &ny) { + //start with guessing nx ~= n^1/2 + nx = round(sqrt(n)); + ny = 1; + + for (;nxnx) std::swap(nx,ny); + + return; + } + } + + //if we made it this far, n is prime + nx = n; +} + +// find a factorization n = nx*ny*nz such that +// nx>=ny>=nz are all 'close' to one another +void Factor3(const int n, int &nx, int &ny, int &nz) { + //start with guessing nx ~= n^1/3 + nx = round(std::cbrt(n)); + ny = nz = 1; + + for (;nxnx) std::swap(nx,ny); + if (nz>ny) std::swap(ny,nz); + if (ny>nx) std::swap(nx,ny); + + return; + } + } + + //if we're here, f is prime + ny = f; + nz = 1; + + //swap if needed + if (ny>nx) std::swap(nx,ny); + + return; + } + } + + //if we made it this far, n is prime + nx = n; +} + +// A function to find largest prime factor +static int maxPrimeFactor(int n) { + int p = -1; + + // Print the number of 2s that divide n + while (n % 2 == 0) { + p = 2; + n >>= 1; // equivalent to n /= 2 + } + // n must be odd at this point + while (n % 3 == 0) { + p = 3; + n=n/3; + } + + // now we have to iterate only for integers + // who does not have prime factor 2 and 3 + for (int i = 5; i <= sqrt(n); i += 6) { + while (n % i == 0) { + p = i; + n = n / i; + } + while (n % (i+2) == 0) { + p = i+2; + n = n / (i+2); + } + } + + // This condition is to handle the case + // when n is a prime number greater than 4 + if (n > 4) p = n; + + return p; +} + +/*Determine the (x,y) coordinates in MPI grid for this process rank*/ +void RankDecomp2(int size_x, int size_y, + int &rank_x, int &rank_y, + const int rank) { + + int size = size_x*size_y; 
+ + if (size==1) { + rank_x=0; + rank_y=0; + return; + } + + /*Determine coordinates via recursive factorization*/ + if (size_y>=size_x) { //size_y is largest + const int p = maxPrimeFactor(size_y); + const int csize = size/p; + const int crank = rank%csize; + + /*Recursive call*/ + int crank_y=-1; + RankDecomp2(size_x, size_y/p, + rank_x, crank_y, crank); + rank_y = crank_y + (rank/csize)*(size_y/p); + } else { //size_x is largest + const int p = maxPrimeFactor(size_x); + const int csize = size/p; + const int crank = rank%csize; + + /*Recursive call*/ + int crank_x=-1; + RankDecomp2(size_x/p, size_y, + crank_x, rank_y, crank); + rank_x = crank_x + (rank/csize)*(size_x/p); + } +} + +/*Determine the (x,y,z) coordinates in MPI grid for this process rank*/ +void RankDecomp3(int size_x, int size_y, int size_z, + int &rank_x, int &rank_y, int &rank_z, + const int rank) { + + int size = size_x*size_y*size_z; + + if (size==1) { + rank_x=0; + rank_y=0; + rank_z=0; + return; + } + + /*Determine coordinates via recursive factorization*/ + if (size_z>=size_x && size_z>=size_y) { //size_z is largest + + const int p = maxPrimeFactor(size_z); + const int csize = size/p; + const int crank = rank%csize; + + /*Recursive call*/ + int crank_z=-1; + RankDecomp3(size_x, size_y, size_z/p, + rank_x, rank_y, crank_z, crank); + rank_z = crank_z + (rank/csize)*(size_z/p); + + } else if (size_y>=size_x && size_y>=size_z) { //size_y is largest + const int p = maxPrimeFactor(size_y); + const int csize = size/p; + const int crank = rank%csize; + + /*Recursive call*/ + int crank_y=-1; + RankDecomp3(size_x, size_y/p, size_z, + rank_x, crank_y, rank_z, crank); + rank_y = crank_y + (rank/csize)*(size_y/p); + } else { //size_x is largest + const int p = maxPrimeFactor(size_x); + const int csize = size/p; + const int crank = rank%csize; + + /*Recursive call*/ + int crank_x=-1; + RankDecomp3(size_x/p, size_y, size_z, + crank_x, rank_y, rank_z, crank); + rank_x = crank_x + (rank/csize)*(size_x/p); + } 
+} + +} //namespace libp diff --git a/libs/core/settings.cpp b/libs/core/settings.cpp index 321bb8d82..202de2856 100644 --- a/libs/core/settings.cpp +++ b/libs/core/settings.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,8 +26,17 @@ SOFTWARE. #include "settings.hpp" -setting_t::setting_t(string name_, string val_, string description_, vector options_) - : name{name_}, val{val_}, description{description_}, options{options_} {} +namespace libp { + +using std::vector; +using std::string; + +setting_t::setting_t(string name_, string val_, + string description_, vector options_): + name{name_}, + val{val_}, + description{description_}, + options{options_} {} const string& setting_t::getName() const { return name; @@ -57,7 +66,7 @@ void setting_t::updateVal(const string newVal){ << "Possible values are: { "; for (size_t i=0;i options) { auto search = settings.find(name); if (search == settings.end()) { - setting_t *S = new setting_t(name, val, description, options); - settings[name] = S; + settings[name] = setting_t(name, val, description, options); insertOrder.push_back(name); } else { - stringstream ss; - ss << "Setting with name: [" << name << "] already exists."; - LIBP_ABORT(ss.str()); + LIBP_FORCE_ABORT("Setting with name: [" << name << "] already exists."); } } @@ -117,12 +123,10 @@ bool settings_t::hasSetting(const string name) { void settings_t::changeSetting(const string name, const string newVal) { auto search = settings.find(name); if (search != settings.end()) { - setting_t* val = search->second; - val->updateVal(newVal); + setting_t& val = search->second; + val.updateVal(newVal); } else { - stringstream ss; - ss << "Setting with name: [" << name << "] 
does not exist."; - LIBP_ABORT(ss.str()); + LIBP_FORCE_ABORT("Setting with name: [" << name << "] does not exist."); } } @@ -131,17 +135,13 @@ void settings_t::readSettingsFromFile(string filename) { string line; std::ifstream file; - int rank; - MPI_Comm_rank(comm, &rank); + int rank = comm.rank(); //only the root rank performs the read if (!rank) { file.open(filename); - if (!file.is_open()) { - stringstream ss; - ss << "Failed to open: " << filename.c_str(); - LIBP_ABORT(ss.str()); - } + LIBP_ABORT("Failed to open: " << filename.c_str(), + !file.is_open()); } string name = ""; @@ -151,23 +151,26 @@ void settings_t::readSettingsFromFile(string filename) { int flag; if (!rank) - flag = (getline(file,line)) ? 1 : 0; + flag = (getline(file,line)) ? 1 : 0; + + comm.Bcast(flag, 0); - MPI_Bcast(&flag, 1, MPI_INT, 0, comm); + int MaxLineSize=512; + memory cline; + cline.calloc(MaxLineSize+1); while (flag) { int size; - char *cline; if (!rank) { size = line.length(); + LIBP_ABORT("Line in settings file is too long: " << line, + size>MaxLineSize); } - MPI_Bcast(&size, 1, MPI_INT, 0, comm); + comm.Bcast(size, 0); - cline = (char*) calloc(size+1,sizeof(char)); - if (!rank) strcpy(cline, line.c_str()); - - MPI_Bcast(cline, size, MPI_CHAR, 0, comm); + if (!rank) strcpy(cline.ptr(), line.c_str()); + comm.Bcast(cline, 0, size); for(int i=0; isecond; - return val->getVal(); + const setting_t& val = search->second; + return val.getVal(); } else { - stringstream ss; - ss << "Unable to find setting: [" << name << "]"; - LIBP_ABORT(ss.str()); + LIBP_FORCE_ABORT("Unable to find setting: [" << name << "]"); return string(); } } @@ -227,12 +227,10 @@ string settings_t::getSetting(const string name) const { bool settings_t::compareSetting(const string name, const string token) const { auto search = settings.find(name); if (search != settings.end()) { - setting_t* val = search->second; - return val->compareVal(token); + const setting_t& val = search->second; + return 
val.compareVal(token); } else { - stringstream ss; - ss << "Unable to find setting: [" << name.c_str() << "]"; - LIBP_ABORT(ss.str()); + LIBP_FORCE_ABORT("Unable to find setting: [" << name.c_str() << "]"); return false; } } @@ -241,24 +239,19 @@ void settings_t::report() { std::cout << "Settings:\n\n"; for (size_t i = 0; i < insertOrder.size(); ++i) { const string &s = insertOrder[i]; - setting_t* val = settings[s]; - std::cout << *val << std::endl; + const setting_t& val = settings[s]; + std::cout << val << std::endl; } } void settings_t::reportSetting(const string name) const { auto search = settings.find(name); if (search != settings.end()) { - setting_t* val = search->second; - std::cout << *val << std::endl; + const setting_t& val = search->second; + std::cout << val << std::endl; } else { - stringstream ss; - ss << "Unable to find setting: [" << name.c_str() << "]"; - LIBP_ABORT(ss.str()); + LIBP_FORCE_ABORT("Unable to find setting: [" << name.c_str() << "]"); } } -settings_t::~settings_t() { - for(auto it = settings.begin(); it != settings.end(); ++it) - delete it->second; -} +} //namespace libp diff --git a/solvers/ins/src/insBoundarySetup.cpp b/libs/core/timer.cpp similarity index 52% rename from solvers/ins/src/insBoundarySetup.cpp rename to libs/core/timer.cpp index fb8a1e83c..382e5a479 100644 --- a/solvers/ins/src/insBoundarySetup.cpp +++ b/libs/core/timer.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -24,32 +24,44 @@ SOFTWARE. 
*/ -#include "ins.hpp" - -void ins_t::BoundarySetup(){ - - //make a node-wise bc flag using the gsop (prioritize Dirichlet boundaries over Neumann) - mapB = (int *) calloc(mesh.Nelements*mesh.Np,sizeof(int)); - const int largeNumber = 1<<20; - for (dlong e=0;e0) { - for (int n=0;nGatherScatter(mapB, ogs_int, ogs_min, ogs_sym); - - for (dlong n=0;n(end-start).count()/(1.0e6); +} + +} //namespace libp diff --git a/libs/linAlg/linAlg.cpp b/libs/linAlg/linAlg.cpp index 6776e8a73..0c0624bf4 100644 --- a/libs/linAlg/linAlg.cpp +++ b/libs/linAlg/linAlg.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,225 +25,202 @@ SOFTWARE. */ #include "linAlg.hpp" +#include "platform.hpp" + +namespace libp { /*********************/ /* vector operations */ /*********************/ // o_a[n] = alpha -void linAlg_t::set(const dlong N, const dfloat alpha, occa::memory& o_a) { +void linAlg_t::set(const dlong N, const dfloat alpha, deviceMemory o_a) { setKernel(N, alpha, o_a); } // o_a[n] += alpha -void linAlg_t::add(const dlong N, const dfloat alpha, occa::memory& o_a) { +void linAlg_t::add(const dlong N, const dfloat alpha, deviceMemory o_a) { addKernel(N, alpha, o_a); } // o_a[n] *= alpha -void linAlg_t::scale(const dlong N, const dfloat alpha, occa::memory& o_a) { +void linAlg_t::scale(const dlong N, const dfloat alpha, deviceMemory o_a) { scaleKernel(N, alpha, o_a); } // o_y[n] = beta*o_y[n] + alpha*o_x[n] -void linAlg_t::axpy(const dlong N, const dfloat alpha, occa::memory& o_x, - const dfloat beta, occa::memory& o_y) { +void linAlg_t::axpy(const dlong N, const dfloat alpha, deviceMemory o_x, + const dfloat beta, deviceMemory o_y) { axpyKernel(N, alpha, o_x, beta, o_y); } // o_z[n] = 
beta*o_y[n] + alpha*o_x[n] -void linAlg_t::zaxpy(const dlong N, const dfloat alpha, occa::memory& o_x, - const dfloat beta, occa::memory& o_y, occa::memory& o_z) { +void linAlg_t::zaxpy(const dlong N, const dfloat alpha, deviceMemory o_x, + const dfloat beta, deviceMemory o_y, deviceMemory o_z) { zaxpyKernel(N, alpha, o_x, beta, o_y, o_z); } // o_x[n] = alpha*o_a[n]*o_x[n] void linAlg_t::amx(const dlong N, const dfloat alpha, - occa::memory& o_a, occa::memory& o_x) { + deviceMemory o_a, deviceMemory o_x) { amxKernel(N, alpha, o_a, o_x); } // o_y[n] = alpha*o_a[n]*o_x[n] + beta*o_y[n] void linAlg_t::amxpy(const dlong N, const dfloat alpha, - occa::memory& o_a, occa::memory& o_x, - const dfloat beta, occa::memory& o_y) { + deviceMemory o_a, deviceMemory o_x, + const dfloat beta, deviceMemory o_y) { amxpyKernel(N, alpha, o_a, o_x, beta, o_y); } // o_z[n] = alpha*o_a[n]*o_x[n] + beta*o_y[n] void linAlg_t::zamxpy(const dlong N, const dfloat alpha, - occa::memory& o_a, occa::memory& o_x, - const dfloat beta, occa::memory& o_y, occa::memory& o_z) { + deviceMemory o_a, deviceMemory o_x, + const dfloat beta, deviceMemory o_y, deviceMemory o_z) { zamxpyKernel(N, alpha, o_a, o_x, beta, o_y, o_z); } // o_x[n] = alpha*o_x[n]/o_a[n] void linAlg_t::adx(const dlong N, const dfloat alpha, - occa::memory& o_a, occa::memory& o_x) { + deviceMemory o_a, deviceMemory o_x) { adxKernel(N, alpha, o_a, o_x); } // o_y[n] = alpha*o_x[n]/o_a[n] + beta*o_y[n] void linAlg_t::adxpy(const dlong N, const dfloat alpha, - occa::memory& o_a, occa::memory& o_x, - const dfloat beta, occa::memory& o_y) { + deviceMemory o_a, deviceMemory o_x, + const dfloat beta, deviceMemory o_y) { adxpyKernel(N, alpha, o_a, o_x, beta, o_y); } // o_z[n] = alpha*o_x[n]/o_a[n] + beta*o_y[n] void linAlg_t::zadxpy(const dlong N, const dfloat alpha, - occa::memory& o_a, occa::memory& o_x, - const dfloat beta, occa::memory& o_y, occa::memory& o_z) { + deviceMemory o_a, deviceMemory o_x, + const dfloat beta, deviceMemory o_y, 
deviceMemory o_z) { zadxpyKernel(N, alpha, o_a, o_x, beta, o_y, o_z); } // \min o_a -dfloat linAlg_t::min(const dlong N, occa::memory& o_a, MPI_Comm comm) { - //TODO, maybe complete reduction on device with second kernel? +dfloat linAlg_t::min(const dlong N, deviceMemory o_a, comm_t comm) { int Nblock = (N+blocksize-1)/blocksize; Nblock = (Nblock>blocksize) ? blocksize : Nblock; //limit to blocksize entries - minKernel(Nblock, N, o_a, o_scratch); - - o_scratch.copyTo(scratch, Nblock*sizeof(dfloat)); + minKernel1(Nblock, N, o_a, o_scratch); + minKernel2(Nblock, o_scratch); - dfloat min = std::numeric_limits::max(); - for(dlong n=0;nfinish(); - dfloat globalmin = 0; - MPI_Allreduce(&min, &globalmin, 1, MPI_DFLOAT, MPI_MIN, comm); + dfloat globalmin = h_scratch[0]; + comm.Allreduce(globalmin, Comm::Min); return globalmin; } // \max o_a -dfloat linAlg_t::max(const dlong N, occa::memory& o_a, MPI_Comm comm) { - //TODO, maybe complete reduction on device with second kernel? +dfloat linAlg_t::max(const dlong N, deviceMemory o_a, comm_t comm) { int Nblock = (N+blocksize-1)/blocksize; Nblock = (Nblock>blocksize) ? blocksize : Nblock; //limit to blocksize entries - maxKernel(Nblock, N, o_a, o_scratch); - - o_scratch.copyTo(scratch, Nblock*sizeof(dfloat)); + maxKernel1(Nblock, N, o_a, o_scratch); + maxKernel2(Nblock, o_scratch); - dfloat max = -std::numeric_limits::max(); - for(dlong n=0;nmax) ? scratch[n] : max; - } + h_scratch.copyFrom(o_scratch, 1, 0, properties_t("async", true)); + platform->finish(); - dfloat globalmax = 0; - MPI_Allreduce(&max, &globalmax, 1, MPI_DFLOAT, MPI_MAX, comm); + dfloat globalmax = h_scratch[0]; + comm.Allreduce(globalmax, Comm::Max); return globalmax; } // \sum o_a -dfloat linAlg_t::sum(const dlong N, occa::memory& o_a, MPI_Comm comm) { - //TODO, maybe complete reduction on device with second kernel? +dfloat linAlg_t::sum(const dlong N, deviceMemory o_a, comm_t comm) { int Nblock = (N+blocksize-1)/blocksize; Nblock = (Nblock>blocksize) ? 
blocksize : Nblock; //limit to blocksize entries - sumKernel(Nblock, N, o_a, o_scratch); + sumKernel1(Nblock, N, o_a, o_scratch); + sumKernel2(Nblock, o_scratch); - o_scratch.copyTo(scratch, Nblock*sizeof(dfloat)); + h_scratch.copyFrom(o_scratch, 1, 0, properties_t("async", true)); + platform->finish(); - dfloat sum = 0; - for(dlong n=0;n o_a, comm_t comm) { int Nblock = (N+blocksize-1)/blocksize; Nblock = (Nblock>blocksize) ? blocksize : Nblock; //limit to blocksize entries - norm2Kernel(Nblock, N, o_a, o_scratch); - - o_scratch.copyTo(scratch, Nblock*sizeof(dfloat)); + norm2Kernel1(Nblock, N, o_a, o_scratch); + norm2Kernel2(Nblock, o_scratch); - dfloat norm = 0; - for(dlong n=0;nfinish(); - dfloat globalnorm = 0; - MPI_Allreduce(&norm, &globalnorm, 1, MPI_DFLOAT, MPI_SUM, comm); + dfloat globalnorm = h_scratch[0]; + comm.Allreduce(globalnorm, Comm::Sum); return sqrt(globalnorm); } // o_x.o_y -dfloat linAlg_t::innerProd(const dlong N, occa::memory& o_x, occa::memory& o_y, - MPI_Comm comm) { - //TODO, maybe complete reduction on device with second kernel? +dfloat linAlg_t::innerProd(const dlong N, deviceMemory o_x, deviceMemory o_y, + comm_t comm) { int Nblock = (N+blocksize-1)/blocksize; Nblock = (Nblock>blocksize) ? blocksize : Nblock; //limit to blocksize entries - innerProdKernel(Nblock, N, o_x, o_y, o_scratch); + innerProdKernel1(Nblock, N, o_x, o_y, o_scratch); + innerProdKernel2(Nblock, o_scratch); - o_scratch.copyTo(scratch, Nblock*sizeof(dfloat)); + h_scratch.copyFrom(o_scratch, 1, 0, properties_t("async", true)); + platform->finish(); - dfloat dot = 0; - for(dlong n=0;n o_w, + deviceMemory o_x, deviceMemory o_y, + comm_t comm) { int Nblock = (N+blocksize-1)/blocksize; Nblock = (Nblock>blocksize) ? 
blocksize : Nblock; //limit to blocksize entries - weightedInnerProdKernel(Nblock, N, o_w, o_x, o_y, o_scratch); - - o_scratch.copyTo(scratch, Nblock*sizeof(dfloat)); + weightedInnerProdKernel1(Nblock, N, o_w, o_x, o_y, o_scratch); + weightedInnerProdKernel2(Nblock, o_scratch); - dfloat dot = 0; - for(dlong n=0;nfinish(); - dfloat globaldot = 0; - MPI_Allreduce(&dot, &globaldot, 1, MPI_DFLOAT, MPI_SUM, comm); + dfloat globaldot = h_scratch[0]; + comm.Allreduce(globaldot, Comm::Sum); return globaldot; } // ||o_a||_w2 -dfloat linAlg_t::weightedNorm2(const dlong N, occa::memory& o_w, - occa::memory& o_a, MPI_Comm comm) { - //TODO, maybe complete reduction on device with second kernel? +dfloat linAlg_t::weightedNorm2(const dlong N, deviceMemory o_w, + deviceMemory o_a, comm_t comm) { int Nblock = (N+blocksize-1)/blocksize; Nblock = (Nblock>blocksize) ? blocksize : Nblock; //limit to blocksize entries - weightedNorm2Kernel(Nblock, N, o_w, o_a, o_scratch); + weightedNorm2Kernel1(Nblock, N, o_w, o_a, o_scratch); + weightedNorm2Kernel2(Nblock, o_scratch); - o_scratch.copyTo(scratch, Nblock*sizeof(dfloat)); + h_scratch.copyFrom(o_scratch, 1, 0, properties_t("async", true)); + platform->finish(); - dfloat norm = 0; - for(dlong n=0;n A) { + + int n = N; int lwork = 4*N; int info; @@ -49,47 +52,35 @@ double matrixConditionNumber(int N, double *A) { double Acond; double Anorm; - double *tmpLU = (double*) calloc(N*N, sizeof(double)); + memory tmpLU(N*N); - int *ipiv = (int*) calloc(N, sizeof(int)); - double *work = (double*) calloc(lwork, sizeof(double)); - int *iwork = (int*) calloc(N, sizeof(int)); + memory ipiv(N); + memory work(lwork); + memory iwork(N); - for(int n=0;n A) { + int n = N; int lwork = 4*N; int info; @@ -98,41 +89,30 @@ float matrixConditionNumber(int N, float *A) { float Acond; float Anorm; - float *tmpLU = (float*) calloc(N*N, sizeof(float)); + memory tmpLU(N*N); - int *ipiv = (int*) calloc(N, sizeof(int)); - float *work = (float*) calloc(lwork, 
sizeof(float)); - int *iwork = (int*) calloc(N, sizeof(int)); + memory ipiv(N); + memory work(lwork); + memory iwork(N); - for(int n=0;n A, + memory VR, + memory WR, + memory WI){ + + int n = N; + char JOBVL = 'N'; + char JOBVR = 'V'; + int LDA = N; + int LDVL = N; + int LDVR = N; + int LWORK = 8*N; + + memory WORK(LWORK); + memory tmpA(N*LDA); + memory tmpVR(N*LDVR); + + //tmpA = A^T (row major to column-major) + linAlg_t::matrixTranspose(N, N, A, LDA, tmpA, LDA); + + int INFO = -999; + + dgeev_ (&JOBVL, &JOBVR, &n, tmpA.ptr(), &LDA, WR.ptr(), WI.ptr(), + nullptr, &LDVL, tmpVR.ptr(), &LDVR, WORK.ptr(), &LWORK, &INFO); + + LIBP_ABORT("dgeev_ reports info = " << INFO, INFO); + + //VR = tmpVR^T (column major to row major) + linAlg_t::matrixTranspose(N, N, tmpVR, LDVR, VR, LDVR); +} + +// compute right eigenvectors +void linAlg_t::matrixEigenVectors(const int N, const memory A, + memory VR, + memory WR, + memory WI){ + + int n = N; + char JOBVL = 'N'; + char JOBVR = 'V'; + int LDA = N; + int LDVL = N; + int LDVR = N; + int LWORK = 8*N; + + memory WORK(LWORK); + memory tmpA(N*LDA); + memory tmpVR(N*LDVR); + + //tmpA = A^T (row major to column-major) + linAlg_t::matrixTranspose(N, N, A, LDA, tmpA, LDA); + + int INFO = -999; + + sgeev_ (&JOBVL, &JOBVR, &n, tmpA.ptr(), &LDA, WR.ptr(), WI.ptr(), + nullptr, &LDVL, tmpVR.ptr(), &LDVR, WORK.ptr(), &LWORK, &INFO); + + LIBP_ABORT("sgeev_ reports info = " << INFO, INFO); + + //VR = tmpVR^T (column major to row major) + linAlg_t::matrixTranspose(N, N, tmpVR, LDVR, VR, LDVR); +} + +// compute eigenvalues +void linAlg_t::matrixEigenValues(const int N, const memory A, + memory WR, + memory WI){ + + int n = N; + char JOBVL = 'N'; + char JOBVR = 'N'; + int LDA = N; + int LDVL = N; + int LDVR = N; + int LWORK = 8*N; + + double* Aptr = const_cast(A.ptr()); + + memory WORK(LWORK); + + int INFO = -999; + + dgeev_ (&JOBVL, &JOBVR, &n, Aptr, &LDA, WR.ptr(), WI.ptr(), + nullptr, &LDVL, nullptr, &LDVR, WORK.ptr(), &LWORK, &INFO); + + 
LIBP_ABORT("dgeev_ reports info = " << INFO, INFO); +} + +// compute eigenvalues +void linAlg_t::matrixEigenValues(const int N, const memory A, + memory WR, + memory WI){ + + int n = N; + char JOBVL = 'N'; + char JOBVR = 'N'; + int LDA = N; + int LDVL = N; + int LDVR = N; + int LWORK = 8*N; + + float* Aptr = const_cast(A.ptr()); + + memory WORK(LWORK); + + int INFO = -999; + + sgeev_ (&JOBVL, &JOBVR, &n, Aptr, &LDA, WR.ptr(), WI.ptr(), + nullptr, &LDVL, nullptr, &LDVR, WORK.ptr(), &LWORK, &INFO); + + LIBP_ABORT("sgeev_ reports info = " << INFO, INFO); +} + +} //namespace libp diff --git a/libs/core/matrixInverse.cpp b/libs/linAlg/linAlgMatrixInverse.cpp similarity index 56% rename from libs/core/matrixInverse.cpp rename to libs/linAlg/linAlgMatrixInverse.cpp index 9c5938513..d81f53394 100644 --- a/libs/core/matrixInverse.cpp +++ b/libs/linAlg/linAlgMatrixInverse.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -24,7 +24,7 @@ SOFTWARE. 
*/ -#include "core.hpp" +#include "linAlg.hpp" extern "C" { void dgetrf_(int* M, int *N, double* A, int* lda, int* IPIV, int* INFO); @@ -34,58 +34,42 @@ extern "C" { void sgetri_(int* N, float* A, int* lda, int* IPIV, float* WORK, int* lwork, int* INFO); } -void matrixInverse(int N, double *A){ +namespace libp { + +void linAlg_t::matrixInverse(const int N, memory A){ + int n = N; int lwork = N*N; int info; - // compute inverse mass matrix - int *ipiv = (int*) calloc(N, sizeof(int)); - double *work = (double*) calloc(lwork, sizeof(double)); - - dgetrf_ (&N, &N, A, &N, ipiv, &info); + // compute inverse matrix in-place + memory ipiv(N); + memory work(lwork); - if(info) { - std::stringstream ss; - ss << "dgetrf_ reports info = " << info; - LIBP_ABORT(ss.str()); - } + dgetrf_ (&n, &n, A.ptr(), &n, ipiv.ptr(), &info); - dgetri_ (&N, A, &N, ipiv, work, &lwork, &info); + LIBP_ABORT("dgetrf_ reports info = " << info, info); - if(info) { - std::stringstream ss; - ss << "dgetri_ reports info = " << info; - LIBP_ABORT(ss.str()); - } + dgetri_ (&n, A.ptr(), &n, ipiv.ptr(), work.ptr(), &lwork, &info); - free(work); - free(ipiv); + LIBP_ABORT("dgetri_ reports info = " << info, info); } -void matrixInverse(int N, float *A){ +void linAlg_t::matrixInverse(const int N, memory A){ + int n = N; int lwork = N*N; int info; - // compute inverse mass matrix - int *ipiv = (int*) calloc(N, sizeof(int)); - float *work = (float*) calloc(lwork, sizeof(float)); + // compute inverse matrix in-place + memory ipiv(N); + memory work(lwork); - sgetrf_ (&N, &N, A, &N, ipiv, &info); + sgetrf_ (&n, &n, A.ptr(), &n, ipiv.ptr(), &info); - if(info) { - std::stringstream ss; - ss << "sgetrf_ reports info = " << info; - LIBP_ABORT(ss.str()); - } + LIBP_ABORT("sgetrf_ reports info = " << info, info); - sgetri_ (&N, A, &N, ipiv, work, &lwork, &info); + sgetri_ (&n, A.ptr(), &n, ipiv.ptr(), work.ptr(), &lwork, &info); - if(info) { - std::stringstream ss; - ss << "sgetri_ reports info = " << info; - 
LIBP_ABORT(ss.str()); - } - - free(work); - free(ipiv); + LIBP_ABORT("sgetri_ reports info = " << info, info); } + +} //namespace libp diff --git a/libs/linAlg/linAlgMatrixRightSolve.cpp b/libs/linAlg/linAlgMatrixRightSolve.cpp new file mode 100644 index 000000000..05019cf62 --- /dev/null +++ b/libs/linAlg/linAlgMatrixRightSolve.cpp @@ -0,0 +1,381 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +#include "linAlg.hpp" + +extern "C" { + void dgesv_ ( int *N, int *NRHS, double *A, + int *LDA, + int *IPIV, + double *B, + int *LDB, + int *INFO ); + + void sgesv_ ( int *N, int *NRHS, float *A, + int *LDA, + int *IPIV, + float *B, + int *LDB, + int *INFO ); + + void dgels_ ( char *TRANS, + int *M, + int *N, + int *NRHS, + double *A, + int *LDA, + double *B, + int *LDB, + double *WORK, + int *LWORK, + int *INFO); + + void sgels_ ( char *TRANS, + int *M, + int *N, + int *NRHS, + float *A, + int *LDA, + float *B, + int *LDB, + float *WORK, + int *LWORK, + int *INFO); + + void dgeqp3_( int *M, + int *N, + double *A, + int *LDA, + int *JPVT, + double *TAU, + double *WORK, + int *LWORK, + int *INFO); + + void sgeqp3_( int *M, + int *N, + float *A, + int *LDA, + int *JPVT, + float *TAU, + float *WORK, + int *LWORK, + int *INFO); + + void dormqr_( char *SIDE, + char *TRANS, + int *M, + int *N, + int *K, + double *A, + int *LDA, + double *TAU, + double *C, + int *LDC, + double *WORK, + int *LWORK, + int *INFO); + + void sormqr_( char *SIDE, + char *TRANS, + int *M, + int *N, + int *K, + float *A, + int *LDA, + float *TAU, + float *C, + int *LDC, + float *WORK, + int *LWORK, + int *INFO); + + void dtrsm_ ( char *SIDE, + char *UPLO, + char *TRANSA, + char *DIAG, + int *M, + int *N, + double *ALPHA, + double *A, + int *LDA, + double *B, + int *LDB); + + void strsm_ ( char *SIDE, + char *UPLO, + char *TRANSA, + char *DIAG, + int *M, + int *N, + float *ALPHA, + float *A, + int *LDA, + float *B, + int *LDB); +} + +namespace libp { + +// C = A/B = trans(trans(B)\trans(A)) +// assume row major +void linAlg_t::matrixRightSolve(const int NrowsA, const int NcolsA, const memory A, + const int NrowsB, const int NcolsB, const memory B, + memory C){ + + int info; + + int NrowsX = NcolsB; + int NcolsX = NrowsB; + + int NrowsY = NcolsA; + int NcolsY = NrowsA; + + int lwork = NrowsX*NcolsX; + + // compute inverse mass matrix + memory tmpX(NrowsX*NcolsX); + memory ipiv(NrowsX); + 
memory work(lwork); + + tmpX.copyFrom(B, NrowsX*NcolsX); + C.copyFrom(A, NrowsY*NcolsY); + + dgesv_(&NrowsX, &NcolsY, tmpX.ptr(), &NrowsX, ipiv.ptr(), C.ptr(), &NrowsY, &info); + + LIBP_ABORT("dgesv_ reports info = " << info, info); +} + +// C = A/B = trans(trans(B)\trans(A)) +// assume row major +void linAlg_t::matrixRightSolve(const int NrowsA, const int NcolsA, const memory A, + const int NrowsB, const int NcolsB, const memory B, + memory C){ + + int info; + + int NrowsX = NcolsB; + int NcolsX = NrowsB; + + int NrowsY = NcolsA; + int NcolsY = NrowsA; + + int lwork = NrowsX*NcolsX; + + // compute inverse mass matrix + memory tmpX(NrowsX*NcolsX); + memory ipiv(NrowsX); + memory work(lwork); + + tmpX.copyFrom(B, NrowsX*NcolsX); + C.copyFrom(A, NrowsY*NcolsY); + + sgesv_(&NrowsX, &NcolsY, tmpX.ptr(), &NrowsX, ipiv.ptr(), C.ptr(), &NrowsY, &info); // ? + + LIBP_ABORT("sgesv_ reports info = " << info, info); +} + +// Find minimum-norm solution to xA = b with NrowsA > NcolsA (underdetermined). +// +// NB: A must be stored ROW MAJOR. +void linAlg_t::matrixUnderdeterminedRightSolveMinNorm(const int NrowsA, const int NcolsA, + const memory A, const memory b, + memory x) { + // Solve A^T x^T = b^T. Note TRANS = 'N', since A is row major. + int INFO = 0; + char TRANS = 'N'; + int NRHS = 1; + int LWORK = 2*NrowsA*NcolsA; + int Nrows = NrowsA; + int Ncols = NcolsA; + + memory WORK(LWORK); + memory tmpA(NrowsA*NcolsA); + memory tmpb(NrowsA); + + tmpA.copyFrom(A, NrowsA*NcolsA); + tmpb.copyFrom(b, NcolsA); + + dgels_(&TRANS, &Ncols, &Nrows, &NRHS, tmpA.ptr(), &Ncols, tmpb.ptr(), &Nrows, WORK.ptr(), &LWORK, &INFO); + + LIBP_ABORT("dgels_ returned INFO = " << INFO, INFO); + + // Copy to output. + x.copyFrom(tmpb, NrowsA); +} + +// Find minimum-norm solution to xA = b with NrowsA > NcolsA (underdetermined). +// +// NB: A must be stored ROW MAJOR. 
+void linAlg_t::matrixUnderdeterminedRightSolveMinNorm(const int NrowsA, const int NcolsA,
+                                                      const memory<float> A, const memory<float> b,
+                                                      memory<float> x) {
+  // Single-precision overload; aborts through LIBP_ABORT when sgels_
+  // returns nonzero INFO.
+  // Solve A^T x^T = b^T.  Note TRANS = 'N', since A is row major.
+  int  INFO  = 0;
+  char TRANS = 'N';
+  int  NRHS  = 1;
+  int  LWORK = 2*NrowsA*NcolsA;
+  int  Nrows = NrowsA;
+  int  Ncols = NcolsA;
+
+  // sgels_ overwrites both the matrix and the right-hand side, so work on
+  // copies. tmpb carries b (NcolsA entries) in and the solution (NrowsA
+  // entries) out, hence its NrowsA length.
+  memory<float> WORK(LWORK);
+  memory<float> tmpA(NrowsA*NcolsA);
+  memory<float> tmpb(NrowsA);
+
+  tmpA.copyFrom(A, NrowsA*NcolsA);
+  tmpb.copyFrom(b, NcolsA);
+
+  sgels_(&TRANS, &Ncols, &Nrows, &NRHS, tmpA.ptr(), &Ncols, tmpb.ptr(), &Nrows, WORK.ptr(), &LWORK, &INFO);
+
+  // Report the routine that was actually called (sgels_, not dgels_).
+  LIBP_ABORT("sgels_ returned INFO = " << INFO, INFO);
+
+  // Copy to output.
+  x.copyFrom(tmpb, NrowsA);
+}
+
+// Solve xA = b with NrowsA > NcolsA (underdetermined) using column-pivoted QR.
+//
+// Done by solving A^T x^T = b^T in 4 steps:
+//   1.  Decompose A^T * P = Q * R.  -->  Q * R * P^T x^T = b^T
+//   2.  Multiply by Q^T.            -->  R * P^T x^T = Q^T b^T
+//   3.  Backsolve with R1.          -->  P^T * x^T = R1^{-1} Q^T b^T
+//       where R1 = leading NcolsA * NcolsA submatrix of R.
+//   4.  Apply permutation.          -->  x^T = P R1^{-1} Q^T b^T
+//
+// A holds NrowsA*NcolsA entries, b holds NcolsA entries and x receives
+// NrowsA entries. Aborts through LIBP_ABORT when dgeqp3_ or dormqr_ returns
+// nonzero INFO.
+//
+// NB: A must be stored ROW MAJOR.
+void linAlg_t::matrixUnderdeterminedRightSolveCPQR(const int NrowsA, const int NcolsA,
+                                                   const memory<double> A, const memory<double> b,
+                                                   memory<double> x) {
+  int INFO  = 0;
+  int LWORK = 3*NrowsA + 1;
+  int Nrows = NrowsA;
+  int Ncols = NcolsA;
+
+  // JPVT starts at zero for every column; tmpb is zero-filled so the
+  // entries past NcolsA stay zero through to the permutation step.
+  memory<int> JPVT(NrowsA, 0);
+  memory<double> TAU(std::min(NrowsA, NcolsA));
+
+  memory<double> WORK;
+  memory<double> tmpA(NrowsA*NcolsA);
+  memory<double> tmpb(NrowsA, 0.0);
+
+  WORK.malloc(LWORK);
+  tmpA.copyFrom(A, NrowsA*NcolsA);
+  tmpb.copyFrom(b, NcolsA);
+
+  // Compute A^T * P = Q * R.
+  dgeqp3_(&Ncols, &Nrows, tmpA.ptr(), &Ncols, JPVT.ptr(), TAU.ptr(), WORK.ptr(), &LWORK, &INFO);
+
+  LIBP_ABORT("dgeqp3_ returned INFO = " << INFO, INFO);
+
+  // Compute Q^T * b^T.
+  char SIDE = 'L';
+  char TRANS = 'T';
+  int NRHS = 1;
+  int NREFLS = NcolsA;
+
+  LWORK = 1;
+  WORK.malloc(LWORK);
+  dormqr_(&SIDE, &TRANS, &Ncols, &NRHS, &NREFLS, tmpA.ptr(), &Ncols, TAU.ptr(), tmpb.ptr(), &Ncols, WORK.ptr(), &LWORK, &INFO);
+
+  LIBP_ABORT("dormqr_ returned INFO = " << INFO, INFO);
+
+  // Compute R1^{-1} * Q^T * b^T
+  SIDE = 'L';
+  char UPLO = 'U';
+  char TRANSA = 'N';
+  char DIAG = 'N';
+  NRHS = 1;
+  double ALPHA = 1.0;
+
+  dtrsm_(&SIDE, &UPLO, &TRANSA, &DIAG, &Ncols, &NRHS, &ALPHA, tmpA.ptr(), &Ncols, tmpb.ptr(), &Ncols);
+
+  // Apply the permutation. JPVT is 1-based on return from dgeqp3_.
+  for (int i = 0; i < NrowsA; i++)
+    x[JPVT[i] - 1] = tmpb[i];
+}
+
+// Solve xA = b with NrowsA > NcolsA (underdetermined) using column-pivoted QR.
+//
+// Done by solving A^T x^T = b^T in 4 steps:
+//   1.  Decompose A^T * P = Q * R.  -->  Q * R * P^T x^T = b^T
+//   2.  Multiply by Q^T.            -->  R * P^T x^T = Q^T b^T
+//   3.  Backsolve with R1.          -->  P^T * x^T = R1^{-1} Q^T b^T
+//       where R1 = leading NcolsA * NcolsA submatrix of R.
+//   4.  Apply permutation.          -->  x^T = P R1^{-1} Q^T b^T
+//
+// Single-precision overload. Aborts through LIBP_ABORT when sgeqp3_ or
+// sormqr_ returns nonzero INFO.
+//
+// NB: A must be stored ROW MAJOR.
+void linAlg_t::matrixUnderdeterminedRightSolveCPQR(const int NrowsA, const int NcolsA,
+                                                   const memory<float> A, const memory<float> b,
+                                                   memory<float> x) {
+  int INFO  = 0;
+  int LWORK = 3*NrowsA + 1;
+  int Nrows = NrowsA;
+  int Ncols = NcolsA;
+
+  // JPVT starts at zero for every column; tmpb is zero-filled so the
+  // entries past NcolsA stay zero through to the permutation step.
+  memory<int> JPVT(NrowsA, 0);
+  memory<float> TAU(std::min(NrowsA, NcolsA));
+
+  memory<float> WORK;
+  memory<float> tmpA(NrowsA*NcolsA);
+  memory<float> tmpb(NrowsA, 0.0);
+
+  WORK.malloc(LWORK);
+  tmpA.copyFrom(A, NrowsA*NcolsA);
+  tmpb.copyFrom(b, NcolsA);
+
+  // Compute A^T * P = Q * R.
+  sgeqp3_(&Ncols, &Nrows, tmpA.ptr(), &Ncols, JPVT.ptr(), TAU.ptr(), WORK.ptr(), &LWORK, &INFO);
+
+  // Report the routine that was actually called (sgeqp3_, not dgeqp3_).
+  LIBP_ABORT("sgeqp3_ returned INFO = " << INFO, INFO);
+
+  // Compute Q^T * b^T.
+  char SIDE = 'L';
+  char TRANS = 'T';
+  int NRHS = 1;
+  int NREFLS = NcolsA;
+
+  LWORK = 1;
+  WORK.malloc(LWORK);
+  sormqr_(&SIDE, &TRANS, &Ncols, &NRHS, &NREFLS, tmpA.ptr(), &Ncols, TAU.ptr(), tmpb.ptr(), &Ncols, WORK.ptr(), &LWORK, &INFO);
+
+  // Report the routine that was actually called (sormqr_, not dormqr_).
+  LIBP_ABORT("sormqr_ returned INFO = " << INFO, INFO);
+
+  // Compute R1^{-1} * Q^T * b^T
+  SIDE = 'L';
+  char UPLO = 'U';
+  char TRANSA = 'N';
+  char DIAG = 'N';
+  NRHS = 1;
+  float ALPHA = 1.0;
+
+  strsm_(&SIDE, &UPLO, &TRANSA, &DIAG, &Ncols, &NRHS, &ALPHA, tmpA.ptr(), &Ncols, tmpb.ptr(), &Ncols);
+
+  // Apply the permutation. JPVT is 1-based on return from sgeqp3_.
+  for (int i = 0; i < NrowsA; i++)
+    x[JPVT[i] - 1] = tmpb[i];
+}
+
+} //namespace libp
diff --git a/libs/core/matrixTranspose.cpp b/libs/linAlg/linAlgMatrixTranspose.cpp
similarity index 61%
rename from libs/core/matrixTranspose.cpp
rename to libs/linAlg/linAlgMatrixTranspose.cpp
index 0277cdd32..49c06949e 100644
--- a/libs/core/matrixTranspose.cpp
+++ b/libs/linAlg/linAlgMatrixTranspose.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -24,13 +24,15 @@ SOFTWARE.
*/ -#include "core.hpp" +#include "linAlg.hpp" + +namespace libp { template -inline +static inline void matrixTranspose_t(const int M, const int N, - const T *A, const int LDA, - T *AT, const int LDAT){ + const memory A, const int LDA, + memory AT, const int LDAT){ //A & A^T - Row major ordering //M = number of rows of A, columns of A^T @@ -42,10 +44,8 @@ void matrixTranspose_t(const int M, const int N, if (N<1 || M<1) return; //check for weird input - if (LDA A, const int LDA, + memory AT, const int LDAT) { matrixTranspose_t(M, N, A, LDA, AT, LDAT); } -void matrixTranspose(const int M, const int N, - const double *A, const int LDA, - double *AT, const int LDAT) { +void linAlg_t::matrixTranspose(const int M, const int N, + const memory A, const int LDA, + memory AT, const int LDAT) { matrixTranspose_t(M, N, A, LDA, AT, LDAT); } -void matrixTranspose(const int M, const int N, - const int *A, const int LDA, - int *AT, const int LDAT) { +void linAlg_t::matrixTranspose(const int M, const int N, + const memory A, const int LDA, + memory AT, const int LDAT) { matrixTranspose_t(M, N, A, LDA, AT, LDAT); } -void matrixTranspose(const int M, const int N, - const long long int *A, const int LDA, - long long int *AT, const int LDAT) { +void linAlg_t::matrixTranspose(const int M, const int N, + const memory A, const int LDA, + memory AT, const int LDAT) { matrixTranspose_t(M, N, A, LDA, AT, LDAT); -} \ No newline at end of file +} + +} //namespace libp diff --git a/libs/linAlg/linAlgSetup.cpp b/libs/linAlg/linAlgSetup.cpp index 640a75096..1e6b6ca6c 100644 --- a/libs/linAlg/linAlgSetup.cpp +++ b/libs/linAlg/linAlgSetup.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -28,32 +28,30 
@@ SOFTWARE. #include "linAlg.hpp" #include "platform.hpp" -#define LINALG_BLOCKSIZE 512 +namespace libp { -linAlg_t::linAlg_t(): blocksize(LINALG_BLOCKSIZE) {}; +linAlg_t::linAlg_t() {}; void linAlg_t::Setup(platform_t *_platform) { platform = _platform; - kernelInfo = platform->props; + kernelInfo = platform->props(); //add defines - kernelInfo["defines/" "p_blockSize"] = (int)LINALG_BLOCKSIZE; - + kernelInfo["defines/" "p_blockSize"] = blocksize; kernelInfo["defines/init_dfloat_min"] = std::numeric_limits::max(); kernelInfo["defines/init_dfloat_max"] = -std::numeric_limits::max(); //pinned scratch buffer - scratch = (dfloat*) platform->hostMalloc(LINALG_BLOCKSIZE*sizeof(dfloat), - NULL, h_scratch); - o_scratch = platform->malloc(LINALG_BLOCKSIZE*sizeof(dfloat)); + h_scratch = platform->hostMalloc(blocksize); + o_scratch = platform->malloc(blocksize); } //initialize list of kernels -void linAlg_t::InitKernels(vector kernels) { +void linAlg_t::InitKernels(std::vector kernels) { for (size_t i=0;ibuildKernel(LINALG_DIR "/okl/" @@ -121,72 +119,86 @@ void linAlg_t::InitKernels(vector kernels) { "zadxpy", kernelInfo); } else if (name=="min") { - if (minKernel.isInitialized()==false) - minKernel = platform->buildKernel(LINALG_DIR "/okl/" + if (minKernel1.isInitialized()==false) { + minKernel1 = platform->buildKernel(LINALG_DIR "/okl/" + "linAlgMin.okl", + "min1", + kernelInfo); + minKernel2 = platform->buildKernel(LINALG_DIR "/okl/" "linAlgMin.okl", - "min", + "min2", kernelInfo); + } } else if (name=="max") { - if (maxKernel.isInitialized()==false) - maxKernel = platform->buildKernel(LINALG_DIR "/okl/" + if (maxKernel1.isInitialized()==false) { + maxKernel1 = platform->buildKernel(LINALG_DIR "/okl/" + "linAlgMax.okl", + "max1", + kernelInfo); + maxKernel2 = platform->buildKernel(LINALG_DIR "/okl/" "linAlgMax.okl", - "max", + "max2", kernelInfo); + } } else if (name=="sum") { - if (sumKernel.isInitialized()==false) - sumKernel = platform->buildKernel(LINALG_DIR "/okl/" 
+      if (sumKernel1.isInitialized()==false) {
+        sumKernel1 = platform->buildKernel(LINALG_DIR "/okl/"
+                                        "linAlgSum.okl",
+                                        "sum1",
+                                        kernelInfo);
+        sumKernel2 = platform->buildKernel(LINALG_DIR "/okl/"
                                         "linAlgSum.okl",
-                                        "sum",
+                                        "sum2",
                                         kernelInfo);
+      }
     } else if (name=="norm2") {
-      if (norm2Kernel.isInitialized()==false)
-        norm2Kernel = platform->buildKernel(LINALG_DIR "/okl/"
+      if (norm2Kernel1.isInitialized()==false) {
+        norm2Kernel1 = platform->buildKernel(LINALG_DIR "/okl/"
                                         "linAlgNorm2.okl",
-                                        "norm2",
+                                        "norm2_1",
                                         kernelInfo);
+        norm2Kernel2 = platform->buildKernel(LINALG_DIR "/okl/"
+                                        "linAlgNorm2.okl",
+                                        "norm2_2",
+                                        kernelInfo);
+      }
     } else if (name=="weightedNorm2") {
-      if (weightedNorm2Kernel.isInitialized()==false)
-        weightedNorm2Kernel = platform->buildKernel(LINALG_DIR "/okl/"
+      if (weightedNorm2Kernel1.isInitialized()==false) {
+        weightedNorm2Kernel1 = platform->buildKernel(LINALG_DIR "/okl/"
+                                        "linAlgWeightedNorm2.okl",
+                                        "weightedNorm2_1",
+                                        kernelInfo);
+        // Kernel2 must load the "_2" entry point, like every other
+        // Kernel1/Kernel2 pair in this function; it previously repeated
+        // "weightedNorm2_1".
+        weightedNorm2Kernel2 = platform->buildKernel(LINALG_DIR "/okl/"
                                         "linAlgWeightedNorm2.okl",
-                                        "weightedNorm2",
+                                        "weightedNorm2_2",
                                         kernelInfo);
+      }
     } else if (name=="innerProd") {
-      if (innerProdKernel.isInitialized()==false)
-        innerProdKernel = platform->buildKernel(LINALG_DIR "/okl/"
+      if (innerProdKernel1.isInitialized()==false) {
+        innerProdKernel1 = platform->buildKernel(LINALG_DIR "/okl/"
                                         "linAlgInnerProd.okl",
-                                        "innerProd",
+                                        "innerProd1",
                                         kernelInfo);
+        innerProdKernel2 = platform->buildKernel(LINALG_DIR "/okl/"
+                                        "linAlgInnerProd.okl",
+                                        "innerProd2",
+                                        kernelInfo);
+      }
     } else if (name=="weightedInnerProd") {
-      if (weightedInnerProdKernel.isInitialized()==false)
-        weightedInnerProdKernel = platform->buildKernel(LINALG_DIR "/okl/"
+      if (weightedInnerProdKernel1.isInitialized()==false) {
+        weightedInnerProdKernel1 = platform->buildKernel(LINALG_DIR "/okl/"
+                                        "linAlgWeightedInnerProd.okl",
+                                        "weightedInnerProd1",
+                                        kernelInfo);
+        weightedInnerProdKernel2 = platform->buildKernel(LINALG_DIR
"/okl/" "linAlgWeightedInnerProd.okl", - "weightedInnerProd", + "weightedInnerProd2", kernelInfo); + } } else { - stringstream ss; - ss << "Requested linAlg routine \"" << name << "\" not found"; - LIBP_ABORT(ss.str()); + LIBP_FORCE_ABORT("Requested linAlg routine \"" << name << "\" not found"); } } } -linAlg_t::~linAlg_t() { - setKernel.free(); - addKernel.free(); - scaleKernel.free(); - axpyKernel.free(); - zaxpyKernel.free(); - amxKernel.free(); - amxpyKernel.free(); - zamxpyKernel.free(); - adxKernel.free(); - adxpyKernel.free(); - zadxpyKernel.free(); - minKernel.free(); - maxKernel.free(); - sumKernel.free(); - norm2Kernel.free(); - weightedNorm2Kernel.free(); - innerProdKernel.free(); - weightedInnerProdKernel.free(); -} \ No newline at end of file +} //namespace libp diff --git a/libs/linAlg/okl/linAlgADXPY.okl b/libs/linAlg/okl/linAlgADXPY.okl index dacd3bfa8..fc5fa9f89 100644 --- a/libs/linAlg/okl/linAlgADXPY.okl +++ b/libs/linAlg/okl/linAlgADXPY.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -60,4 +60,4 @@ SOFTWARE. for(dlong n=0;n512 for(int t=0;t256 for(int t=0;t512 + for(int t=0;t256 + for(int t=0;ts_max[t]) ? x[id] : s_max[t]; - id += p_blockSize*Nblocks; + r_max = (x[id]>r_max) ? x[id] : r_max; + id += p_blockSize; } + s_max[t] = r_max; } #if p_blockSize>512 for(int t=0;ts_max[t]) ? s_max[t+512] : s_max[t]; #endif - #if p_blockSize>256 for(int t=0;ts_max[t]) ? s_max[t+256] : s_max[t]; #endif + for(int t=0;ts_max[t]) ? s_max[t+128] : s_max[t]; + + for(int t=0;ts_max[t]) ? s_max[t+ 64] : s_max[t]; + + for(int t=0;ts_max[t]) ? s_max[t+ 32] : s_max[t]; + + for(int t=0;ts_max[t]) ? s_max[t+ 16] : s_max[t]; + + for(int t=0;ts_max[t]) ? 
s_max[t+ 8] : s_max[t]; + + for(int t=0;ts_max[t]) ? s_max[t+ 4] : s_max[t]; + + for(int t=0;ts_max[t]) ? s_max[t+ 2] : s_max[t]; + for(int t=0;ts_max[1]) ? s_max[0] : s_max[1]; + } +} + +@kernel void max2(const dlong Nblocks, @restrict dfloat *max){ + + + for(dlong b=0;b<1;++b;@outer(0)){ + + @shared dfloat s_max[p_blockSize]; + + for(int t=0;tr_max) ? max[id] : r_max; + id += p_blockSize; + } + s_max[t] = r_max; + } + +#if p_blockSize>512 + for(int t=0;ts_max[t]) ? s_max[t+512] : s_max[t]; +#endif +#if p_blockSize>256 + for(int t=0;ts_max[t]) ? s_max[t+256] : s_max[t]; +#endif for(int t=0;ts_max[t]) ? s_max[t+128] : s_max[t]; diff --git a/libs/linAlg/okl/linAlgMin.okl b/libs/linAlg/okl/linAlgMin.okl index 78c586118..28e0a7d84 100644 --- a/libs/linAlg/okl/linAlgMin.okl +++ b/libs/linAlg/okl/linAlgMin.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -24,35 +24,87 @@ SOFTWARE. 
*/ -@kernel void min(const dlong Nblocks, - const dlong N, - @restrict const dfloat *x, - @restrict dfloat *min){ +@kernel void min1(const dlong Nblocks, + const dlong N, + @restrict const dfloat *x, + @restrict dfloat *min){ for(dlong b=0;b512 for(int t=0;t256 for(int t=0;t512 + for(int t=0;t256 + for(int t=0;t512 for(int t=0;t256 for(int t=0;t512 - for(int t=0;t256 - for(int t=0;t512 for(int t=0;t256 for(int t=0;t512 + for(int t=0;t256 + for(int t=0;t512 - for(int t=0;t256 - for(int t=0;t512 + for(int t=0;t256 + for(int t=0;tlinearSolver = linearSolver_t::Setup(N, Nhalo, platform, settings, comm); - initialGuessSolver->igStrategy = nullptr; - - if (settings.compareSetting("INITIAL GUESS STRATEGY", "NONE")) { - initialGuessSolver->igStrategy = new igDefaultStrategy(N, platform, settings, comm); - } else if (settings.compareSetting("INITIAL GUESS STRATEGY", "ZERO")) { - initialGuessSolver->igStrategy = new igZeroStrategy(N, platform, settings, comm); - } else if (settings.compareSetting("INITIAL GUESS STRATEGY", "CLASSIC")) { - initialGuessSolver->igStrategy = new igClassicProjectionStrategy(N, platform, settings, comm); - } else if (settings.compareSetting("INITIAL GUESS STRATEGY", "QR")) { - initialGuessSolver->igStrategy = new igRollingQRProjectionStrategy(N, platform, settings, comm); - } else if (settings.compareSetting("INITIAL GUESS STRATEGY", "EXTRAP")) { - initialGuessSolver->igStrategy = new igExtrapStrategy(N, platform, settings, comm); - } else { - LIBP_ABORT("Requested INITIAL GUESS STRATEGY not found."); - } - - return initialGuessSolver; -} - -initialGuessSolver_t::initialGuessSolver_t(dlong _N, dlong _Nhalo, platform_t& _platform, settings_t& _settings, MPI_Comm _comm): - linearSolver_t(_N, _Nhalo, _platform, _settings, _comm), - igStrategy(nullptr), - linearSolver(nullptr) -{ - return; -} +namespace libp { -initialGuessSolver_t::~initialGuessSolver_t() -{ - delete igStrategy; - delete linearSolver; -} - -int initialGuessSolver_t::Solve(solver_t& 
solver, precon_t& precon, occa::memory& o_x, occa::memory& o_rhs, const dfloat tol, const int MAXIT, const int verbose) -{ - int iter = 0; +namespace InitialGuess { - igStrategy->FormInitialGuess(o_x, o_rhs); - iter = linearSolver->Solve(solver, precon, o_x, o_rhs, tol, MAXIT, verbose); - igStrategy->Update(solver, o_x, o_rhs); - - return iter; -} - -/*****************************************************************************/ +#define IG_BLOCKSIZE 256 -void initialGuessAddSettings(settings_t& settings, const string prefix) +void AddSettings(settings_t& settings, const std::string prefix) { settings.newSetting(prefix + "INITIAL GUESS STRATEGY", "NONE", @@ -100,166 +56,116 @@ void initialGuessAddSettings(settings_t& settings, const string prefix) /*****************************************************************************/ -initialGuessStrategy_t::initialGuessStrategy_t(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm): - platform(_platform), - settings(_settings), - comm(_comm), - Ntotal(_N) -{ - return; -} - -initialGuessStrategy_t::~initialGuessStrategy_t() -{ - return; -} - -/*****************************************************************************/ - -igDefaultStrategy::igDefaultStrategy(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm): +Default::Default(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm): initialGuessStrategy_t(_N, _platform, _settings, _comm) -{ - return; -} +{} -void igDefaultStrategy::FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs) -{ - return; -} +void Default::FormInitialGuess(deviceMemory& o_x, deviceMemory& o_rhs) +{} -void igDefaultStrategy::Update(solver_t &solver, occa::memory& o_x, occa::memory& o_rhs) -{ - return; -} +void Default::Update(operator_t &linearOperator, deviceMemory& o_x, deviceMemory& o_rhs) +{} /*****************************************************************************/ -igZeroStrategy::igZeroStrategy(dlong _N, platform_t& _platform, 
settings_t& _settings, MPI_Comm _comm): +Zero::Zero(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm): initialGuessStrategy_t(_N, _platform, _settings, _comm) { - platform.linAlg.InitKernels({"set"}); - return; + platform.linAlg().InitKernels({"set"}); } -void igZeroStrategy::FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs) +void Zero::FormInitialGuess(deviceMemory& o_x, deviceMemory& o_rhs) { - platform.linAlg.set(Ntotal, 0.0, o_x); - return; + platform.linAlg().set(Ntotal, 0.0, o_x); } -void igZeroStrategy::Update(solver_t &solver, occa::memory& o_x, occa::memory& o_rhs) -{ - return; -} +void Zero::Update(operator_t &linearOperator, deviceMemory& o_x, deviceMemory& o_rhs) +{} /*****************************************************************************/ -igProjectionStrategy::igProjectionStrategy(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm): +Projection::Projection(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm): initialGuessStrategy_t(_N, _platform, _settings, _comm) { curDim = 0; settings.getSetting("INITIAL GUESS HISTORY SPACE DIMENSION", maxDim); - o_btilde = platform.malloc(Ntotal*sizeof(dfloat)); - o_xtilde = platform.malloc(Ntotal*sizeof(dfloat)); - o_Btilde = platform.malloc(Ntotal*maxDim*sizeof(dfloat)); - o_Xtilde = platform.malloc(Ntotal*maxDim*sizeof(dfloat)); + o_btilde = platform.malloc(Ntotal); + o_xtilde = platform.malloc(Ntotal); + o_Btilde = platform.malloc(Ntotal*maxDim); + o_Xtilde = platform.malloc(Ntotal*maxDim); - alphas = new dfloat[maxDim](); - alphasThisRank = new dfloat[maxDim](); - o_alphas = platform.malloc(maxDim*sizeof(dfloat)); + alphas = platform.hostMalloc(maxDim); + o_alphas = platform.malloc(maxDim); - ctmpNblocks = (Ntotal + BLOCKSIZE - 1)/BLOCKSIZE; - ctmp = (dfloat*)calloc(ctmpNblocks*maxDim, sizeof(dfloat)); - o_ctmp = platform.malloc(ctmpNblocks*maxDim*sizeof(dfloat), ctmp); + ctmpNblocks = (Ntotal + IG_BLOCKSIZE - 1)/IG_BLOCKSIZE; + ctmp = 
platform.hostMalloc(ctmpNblocks*maxDim); + o_ctmp = platform.malloc(ctmpNblocks*maxDim); // Build kernels. - platform.linAlg.InitKernels({"set"}); + platform.linAlg().InitKernels({"set"}); - occa::properties kernelInfo = platform.props; + properties_t kernelInfo = platform.props(); kernelInfo["defines/" "p_igNhist"] = maxDim; igBasisInnerProductsKernel = platform.buildKernel(LINEARSOLVER_DIR "/okl/igBasisInnerProducts.okl", "igBasisInnerProducts", kernelInfo); igReconstructKernel = platform.buildKernel(LINEARSOLVER_DIR "/okl/igReconstruct.okl", "igReconstruct", kernelInfo); igScaleKernel = platform.buildKernel(LINEARSOLVER_DIR "/okl/igScale.okl", "igScale", kernelInfo); igUpdateKernel = platform.buildKernel(LINEARSOLVER_DIR "/okl/igUpdate.okl", "igUpdate", kernelInfo); - - return; -} - -igProjectionStrategy::~igProjectionStrategy() -{ - if (ctmp) - delete[] ctmp; - if (alphas) - delete[] alphas; - if (alphasThisRank) - delete[] alphasThisRank; - - return; } -void igProjectionStrategy::FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs) +void Projection::FormInitialGuess(deviceMemory& o_x, deviceMemory& o_rhs) { if (curDim > 0) { - igBasisInnerProducts(o_rhs, o_Btilde, o_alphas, alphas, alphasThisRank); - platform.linAlg.set(Ntotal, 0.0, o_x); + igBasisInnerProducts(o_rhs, o_Btilde, o_alphas, alphas); + platform.linAlg().set(Ntotal, 0.0, o_x); igReconstruct(o_x, 1.0, o_alphas, o_Xtilde, o_x); } - - return; } -void igProjectionStrategy::igBasisInnerProducts(occa::memory& o_x, occa::memory& o_Q, occa::memory& o_c, dfloat *c, dfloat *cThisRank) +void Projection::igBasisInnerProducts(deviceMemory& o_x, deviceMemory& o_Q, deviceMemory& o_c, pinnedMemory& c) { igBasisInnerProductsKernel(Ntotal, ctmpNblocks, curDim, o_x, o_Q, o_ctmp); - o_ctmp.copyTo(ctmp, ctmpNblocks*curDim*sizeof(dfloat)); + ctmp.copyFrom(o_ctmp, ctmpNblocks*curDim); - dlong cnt = 0; for (int m = 0; m < curDim; ++m) { - cThisRank[m] = 0; + c[m] = 0; for (int n = 0; n < ctmpNblocks; ++n) { - 
cThisRank[m] += ctmp[cnt]; - ++cnt; + c[m] += ctmp[m*ctmpNblocks + n]; } } - MPI_Allreduce(cThisRank, c, curDim, MPI_DFLOAT, MPI_SUM, comm); - o_c.copyFrom(c, curDim*sizeof(dfloat)); - - return; + comm.Allreduce(c, Comm::Sum, curDim); + c.copyTo(o_c, curDim); } -void igProjectionStrategy::igReconstruct(occa::memory& o_u, dfloat a, occa::memory& o_c, occa::memory& o_Q, occa::memory& o_unew) +void Projection::igReconstruct(deviceMemory& o_u, dfloat a, deviceMemory& o_c, deviceMemory& o_Q, deviceMemory& o_unew) { igReconstructKernel(Ntotal, curDim, o_u, a, o_c, o_Q, o_unew); - return; } /*****************************************************************************/ -igClassicProjectionStrategy::igClassicProjectionStrategy(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm): - igProjectionStrategy(_N, _platform, _settings, _comm) -{ - return; -} +ClassicProjection::ClassicProjection(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm): + Projection(_N, _platform, _settings, _comm) +{} -void igClassicProjectionStrategy::Update(solver_t &solver, occa::memory& o_x, occa::memory& o_rhs) +void ClassicProjection::Update(operator_t &linearOperator, deviceMemory& o_x, deviceMemory& o_rhs) { // Compute RHS corresponding to the approximate solution obtained. - solver.Operator(o_x, o_btilde); + linearOperator.Operator(o_x, o_btilde); // Insert new solution into the initial guess space. 
if ((curDim >= maxDim) || (curDim == 0)) { dfloat normbtilde = 0.0; - normbtilde = platform.linAlg.norm2(Ntotal, o_btilde, comm); + normbtilde = platform.linAlg().norm2(Ntotal, o_btilde, comm); if (normbtilde > 0) { - igScaleKernel(Ntotal, 1.0/normbtilde, o_btilde, o_Btilde); - igScaleKernel(Ntotal, 1.0/normbtilde, o_x, o_Xtilde); + igScaleKernel(Ntotal, dfloat(1.0)/normbtilde, o_btilde, o_Btilde); + igScaleKernel(Ntotal, dfloat(1.0)/normbtilde, o_x, o_Xtilde); curDim = 1; } @@ -267,17 +173,17 @@ void igClassicProjectionStrategy::Update(solver_t &solver, occa::memory& o_x, oc dfloat invnormbtilde = 0.0; const int Nreorth = 2; - o_x.copyTo(o_xtilde, Ntotal*sizeof(dfloat)); + o_x.copyTo(o_xtilde, Ntotal); // Orthogonalize new RHS against previous ones. for (int n = 0; n < Nreorth; n++) { - igBasisInnerProducts(o_btilde, o_Btilde, o_alphas, alphas, alphasThisRank); - igReconstruct(o_btilde, (dfloat)(-1.0), o_alphas, o_Btilde, o_btilde); - igReconstruct(o_xtilde, (dfloat)(-1.0), o_alphas, o_Xtilde, o_xtilde); + igBasisInnerProducts(o_btilde, o_Btilde, o_alphas, alphas); + igReconstruct(o_btilde, -1.0, o_alphas, o_Btilde, o_btilde); + igReconstruct(o_xtilde, -1.0, o_alphas, o_Xtilde, o_xtilde); } // Normalize. 
- invnormbtilde = platform.linAlg.norm2(Ntotal, o_btilde, comm); + invnormbtilde = platform.linAlg().norm2(Ntotal, o_btilde, comm); invnormbtilde = 1.0/invnormbtilde; #if 0 @@ -293,38 +199,26 @@ void igClassicProjectionStrategy::Update(solver_t &solver, occa::memory& o_x, oc curDim++; } - - return; } /*****************************************************************************/ -igRollingQRProjectionStrategy::igRollingQRProjectionStrategy(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm): - igProjectionStrategy(_N, _platform, _settings, _comm) +RollingQRProjection::RollingQRProjection(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm): + Projection(_N, _platform, _settings, _comm) { - R = new dfloat[maxDim*maxDim](); - o_R = platform.malloc(maxDim*maxDim*sizeof(dfloat)); + R = platform.hostMalloc(maxDim*maxDim); + o_R = platform.malloc(maxDim*maxDim); - occa::properties kernelInfo = platform.props; + properties_t kernelInfo = platform.props(); kernelInfo["defines/" "p_igNhist"] = maxDim; igDropQRFirstColumnKernel = platform.buildKernel(LINEARSOLVER_DIR "/okl/igDropQRFirstColumn.okl", "igDropQRFirstColumn", kernelInfo); - - return; } -igRollingQRProjectionStrategy::~igRollingQRProjectionStrategy() -{ - if (R) - delete[] R; - - return; -} - -void igRollingQRProjectionStrategy::Update(solver_t &solver, occa::memory& o_x, occa::memory& o_rhs) +void RollingQRProjection::Update(operator_t &linearOperator, deviceMemory& o_x, deviceMemory& o_rhs) { // Compute RHS corresponding to the approximate solution obtained. - solver.Operator(o_x, o_btilde); + linearOperator.Operator(o_x, o_btilde); // Rotate the history space (QR update). if (curDim == maxDim) { @@ -335,7 +229,7 @@ void igRollingQRProjectionStrategy::Update(solver_t &solver, occa::memory& o_x, R[j*maxDim + (maxDim - 1)] = 0.0; } - o_R.copyFrom(R); + R.copyTo(o_R); // Update the RHS and solution spaces. 
igDropQRFirstColumnKernel(Ntotal, o_Btilde, o_Xtilde, o_R); @@ -346,7 +240,7 @@ void igRollingQRProjectionStrategy::Update(solver_t &solver, occa::memory& o_x, dfloat Rjj = R[j*maxDim + j]; dfloat Rjp1j = R[(j + 1)*maxDim + j]; - givensRotation(Rjj, Rjp1j, &c, &s); + givensRotation(Rjj, Rjp1j, c, s); for (int i = j; i < maxDim; i++) { dfloat Rji = R[j*maxDim + i]; @@ -358,8 +252,8 @@ void igRollingQRProjectionStrategy::Update(solver_t &solver, occa::memory& o_x, } // Copy the updated R back to the device. - platform.device.finish(); - o_R.copyFrom(R); + platform.finish(); + R.copyTo(o_R); curDim--; } @@ -368,7 +262,7 @@ void igRollingQRProjectionStrategy::Update(solver_t &solver, occa::memory& o_x, if (curDim == 0) { dfloat normbtilde = 0.0; - normbtilde = platform.linAlg.norm2(Ntotal, o_btilde, comm); + normbtilde = platform.linAlg().norm2(Ntotal, o_btilde, comm); if (normbtilde > 0) { #if 0 @@ -387,10 +281,10 @@ void igRollingQRProjectionStrategy::Update(solver_t &solver, occa::memory& o_x, dfloat normbtilde = 0.0, normbtildeproj = 0.0;; const int Nreorth = 2; - o_x.copyTo(o_xtilde, Ntotal*sizeof(dfloat)); + o_x.copyTo(o_xtilde, Ntotal); // Compute the initial norm of the new vector. - normbtilde = platform.linAlg.norm2(Ntotal, o_btilde, comm); + normbtilde = platform.linAlg().norm2(Ntotal, o_btilde, comm); // Zero the entries above/on the diagonal of the column of R into which we want to write. for (int i = 0; i < curDim; i++) @@ -398,7 +292,7 @@ void igRollingQRProjectionStrategy::Update(solver_t &solver, occa::memory& o_x, // Orthogonalize new RHS against previous ones. 
for (int n = 0; n < Nreorth; n++) { - igBasisInnerProducts(o_btilde, o_Btilde, o_alphas, alphas, alphasThisRank); + igBasisInnerProducts(o_btilde, o_Btilde, o_alphas, alphas); igReconstruct(o_btilde, (dfloat)(-1.0), o_alphas, o_Btilde, o_btilde); igReconstruct(o_xtilde, (dfloat)(-1.0), o_alphas, o_Xtilde, o_xtilde); @@ -407,7 +301,7 @@ void igRollingQRProjectionStrategy::Update(solver_t &solver, occa::memory& o_x, } // Normalize. - normbtildeproj = platform.linAlg.norm2(Ntotal, o_btilde, comm); + normbtildeproj = platform.linAlg().norm2(Ntotal, o_btilde, comm); // Only add if the remainder after projection is large enough. // @@ -431,63 +325,57 @@ void igRollingQRProjectionStrategy::Update(solver_t &solver, occa::memory& o_x, } } - o_R.copyFrom(R); + R.copyTo(o_R); } -void igRollingQRProjectionStrategy::givensRotation(dfloat a, dfloat b, dfloat *c, dfloat *s) +void RollingQRProjection::givensRotation(dfloat a, dfloat b, dfloat& c, dfloat& s) { // Compute a Givens rotation that zeros the bottom component of [a ; b]. 
if (b != 0) { dfloat h = hypot(a, b); dfloat d = 1.0/h; - *c = fabs(a)*d; - *s = copysign(d, a)*b; + c = std::abs(a)*d; + s = std::copysign(d, a)*b; } else { - *c = 1.0; - *s = 0.0; + c = 1.0; + s = 0.0; } - - return; } /*****************************************************************************/ -igExtrapStrategy::igExtrapStrategy(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm): +Extrap::Extrap(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm): initialGuessStrategy_t(_N, _platform, _settings, _comm) { int M, m; settings.getSetting("INITIAL GUESS HISTORY SPACE DIMENSION", M); settings.getSetting("INITIAL GUESS EXTRAP DEGREE", m); - dfloat *c = new dfloat[M](); + memory c(M); extrapCoeffs(m, M, c); Nhistory = M; entry = 0; - o_coeffs = platform.malloc(Nhistory*sizeof(dfloat), c); + o_coeffs = platform.malloc(Nhistory, c); shift = 0; - o_xh = platform.malloc(Nhistory*Ntotal*sizeof(dfloat)); + o_xh = platform.malloc(Nhistory*Ntotal); - platform.linAlg.InitKernels({"set"}); + platform.linAlg().InitKernels({"set"}); - occa::properties kernelInfo = platform.props; + properties_t kernelInfo = platform.props(); kernelInfo["defines/" "p_igNhist"] = Nhistory; igExtrapKernel = platform.buildKernel(LINEARSOLVER_DIR "/okl/igExtrap.okl", "igExtrap", kernelInfo); igExtrapSparseKernel = platform.buildKernel(LINEARSOLVER_DIR "/okl/igExtrap.okl", "igExtrapSparse", kernelInfo); - platform.linAlg.set(Nhistory*Ntotal, 0.0, o_xh); - - delete[] c; - - return; + platform.linAlg().set(Nhistory*Ntotal, 0.0, o_xh); } -void igExtrapStrategy::FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs) +void Extrap::FormInitialGuess(deviceMemory& o_x, deviceMemory& o_rhs) { if (entry < Nhistory) { int M, m; @@ -495,16 +383,14 @@ void igExtrapStrategy::FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs) settings.getSetting("INITIAL GUESS HISTORY SPACE DIMENSION", M); settings.getSetting("INITIAL GUESS EXTRAP DEGREE", m); } else { - M = mymax(1, 
entry + 1); - m = sqrt((double)M); + M = std::max(1, entry + 1); + m = sqrt(static_cast(M)); } // Construct the extrapolation coefficients. - dfloat *c, *d, *sparseCoeffs; - - c = new dfloat[Nhistory](); - d = new dfloat[Nhistory](); - sparseCoeffs = new dfloat[Nhistory](); + memory c(Nhistory); + memory d(Nhistory); + memory sparseCoeffs(Nhistory); for (int n = 0; n < Nhistory; ++n) { c[n] = 0; d[n] = 0; @@ -512,7 +398,7 @@ void igExtrapStrategy::FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs) } if (M == 1) { - d[Nhistory - 1] = 1.0; + d[Nhistory - 1] = 1.0; } else { extrapCoeffs(m, M, c); @@ -521,23 +407,21 @@ void igExtrapStrategy::FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs) d[Nhistory - M + i] = c[i]; } - int *sparseIds = new int[Nhistory](); + memory sparseIds(Nhistory); Nsparse = 0; for (int n = 0; n < Nhistory; ++n) { - if (fabs(d[n]) > 1e-14) { // hmm + if (std::abs(d[n]) > 1e-14) { // hmm sparseIds[Nsparse] = n; sparseCoeffs[Nsparse] = d[n]; ++Nsparse; } } - o_coeffs = platform.malloc(Nhistory*sizeof(dfloat), d); - o_sparseIds = platform.malloc(Nhistory*sizeof(int), sparseIds); - o_sparseCoeffs = platform.malloc(Nhistory*sizeof(dfloat), sparseCoeffs); + o_coeffs = platform.malloc(d); + o_sparseIds = platform.malloc(sparseIds); + o_sparseCoeffs = platform.malloc(sparseCoeffs); ++entry; - - delete[] sparseIds; } if (settings.compareSetting("INITIAL GUESS EXTRAP COEFFS METHOD", "MINNORM")) @@ -545,50 +429,41 @@ void igExtrapStrategy::FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs) else { igExtrapSparseKernel(Ntotal, Nhistory, shift, Nsparse, o_sparseIds, o_sparseCoeffs, o_xh, o_x); } - - return; } -void igExtrapStrategy::Update(solver_t &solver, occa::memory& o_x, occa::memory& o_rhs) +void Extrap::Update(operator_t &linearOperator, deviceMemory& o_x, deviceMemory& o_rhs) { - occa::memory o_tmp = o_xh + Ntotal*shift*sizeof(dfloat); - o_x.copyTo(o_tmp, Ntotal*sizeof(dfloat)); + deviceMemory o_tmp = o_xh + Ntotal*shift; + 
o_x.copyTo(o_tmp, Ntotal); shift = (shift + 1) % Nhistory; - - return; } -void igExtrapStrategy::extrapCoeffs(int m, int M, dfloat *c) +void Extrap::extrapCoeffs(int m, int M, memory c) { - dfloat h, ro, *r, *V, *b; - - if (M < m + 1) { - std::stringstream ss; - ss << "Extrapolation space dimension (" << M << ") too low for degree (" << m << ")."; - LIBP_ABORT(ss.str()); - } + LIBP_ABORT("Extrapolation space dimension (" << M << ") too low for degree (" << m << ").", + M < m + 1); - h = 2.0/(M - 1); - r = new dfloat[M](); + const dfloat h = 2.0/(M - 1); + memory r(M); for (int i = 0; i < M; i++) r[i] = -1.0 + i*h; - ro = 1.0 + h; // Evaluation point. - V = new dfloat[(m + 1)*M](); - mesh_t::Vandermonde1D(m, M, r, V); + memory ro(1); + ro[0] = 1.0 + h; // Evaluation point. - b = new dfloat[m + 1](); - mesh_t::Vandermonde1D(m, 1, &ro, b); + memory V; + mesh_t::Vandermonde1D(m, r, V); + + memory b; + mesh_t::Vandermonde1D(m, ro, b); if (settings.compareSetting("INITIAL GUESS EXTRAP COEFFS METHOD", "MINNORM")) { - matrixUnderdeterminedRightSolveMinNorm(M, m + 1, V, b, c); + linAlg_t::matrixUnderdeterminedRightSolveMinNorm(M, m + 1, V, b, c); } else if (settings.compareSetting("INITIAL GUESS EXTRAP COEFFS METHOD", "CPQR")) { - matrixUnderdeterminedRightSolveCPQR(M, m + 1, V, b, c); + linAlg_t::matrixUnderdeterminedRightSolveCPQR(M, m + 1, V, b, c); } +} - delete[] r; - delete[] V; - delete[] b; +} //namespace InitialGuess - return; -} +} //namespace libp diff --git a/libs/linearSolver/linearSolver.cpp b/libs/linearSolver/linearSolver.cpp index ac60163de..50289ccad 100644 --- a/libs/linearSolver/linearSolver.cpp +++ b/libs/linearSolver/linearSolver.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Anthony Austin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 
documentation files (the "Software"), to deal @@ -26,25 +26,33 @@ SOFTWARE. #include "linearSolver.hpp" -//virtual base linear solver class -linearSolver_t* linearSolver_t::Setup(dlong N, dlong Nhalo, - platform_t& platform, settings_t& settings, MPI_Comm comm) { - - linearSolver_t *linearSolver=NULL; - - if (settings.compareSetting("LINEAR SOLVER","NBPCG")){ - linearSolver = new nbpcg(N, Nhalo, platform, settings, comm); - } else if (settings.compareSetting("LINEAR SOLVER","NBFPCG")){ - linearSolver = new nbfpcg(N, Nhalo, platform, settings, comm); - } else if (settings.compareSetting("LINEAR SOLVER","PCG")){ - linearSolver = new pcg(N, Nhalo, platform, settings, comm); - } else if (settings.compareSetting("LINEAR SOLVER","PGMRES")){ - linearSolver = new pgmres(N, Nhalo, platform, settings, comm); - } else if (settings.compareSetting("LINEAR SOLVER","PMINRES")){ - linearSolver = new pminres(N, Nhalo, platform, settings, comm); - } else { - LIBP_ABORT(string("Requested LINEAR SOLVER not found.")); - } - - return linearSolver; +namespace libp { + +int linearSolver_t::Solve(operator_t& linearOperator, + operator_t& precon, + deviceMemory& o_x, + deviceMemory& o_rhs, + const dfloat tol, + const int MAXIT, + const int verbose) { + assertInitialized(); + ig->FormInitialGuess(o_x, o_rhs); + int iters = ls->Solve(linearOperator, precon, o_x, o_rhs, tol, MAXIT, verbose); + ig->Update(linearOperator, o_x, o_rhs); + + return iters; } + +void linearSolver_t::MakeDefaultInitialGuessStrategy() { + ig = std::make_shared(ls->N, ls->platform, + ls->settings, ls->comm); +} + +void linearSolver_t::assertInitialized() { + LIBP_ABORT("LinearSolver not initialized", + ls==nullptr); + LIBP_ABORT("InitialGuess not initialized", + ig==nullptr); +} + +} //namespace libp diff --git a/libs/linearSolver/linearSolverNBFPCG.cpp b/libs/linearSolver/linearSolverNBFPCG.cpp index 3bc8c63a8..20ae83ba8 100644 --- a/libs/linearSolver/linearSolverNBFPCG.cpp +++ 
b/libs/linearSolver/linearSolverNBFPCG.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,36 +26,37 @@ SOFTWARE. #include "linearSolver.hpp" +namespace libp { + +namespace LinearSolver { + #define NBFPCG_BLOCKSIZE 512 nbfpcg::nbfpcg(dlong _N, dlong _Nhalo, - platform_t& _platform, settings_t& _settings, MPI_Comm _comm): - linearSolver_t(_N, _Nhalo, _platform, _settings, _comm) { + platform_t& _platform, settings_t& _settings, comm_t _comm): + linearSolverBase_t(_N, _Nhalo, _platform, _settings, _comm) { + + platform.linAlg().InitKernels({"axpy", "zaxpy"}); dlong Ntotal = N + Nhalo; /*aux variables */ - o_u = platform.malloc(Ntotal*sizeof(dfloat)); - o_p = platform.malloc(Ntotal*sizeof(dfloat)); - o_w = platform.malloc(Ntotal*sizeof(dfloat)); - o_n = platform.malloc(Ntotal*sizeof(dfloat)); - o_m = platform.malloc(Ntotal*sizeof(dfloat)); - o_s = platform.malloc(Ntotal*sizeof(dfloat)); - o_z = platform.malloc(Ntotal*sizeof(dfloat)); - o_q = platform.malloc(Ntotal*sizeof(dfloat)); - o_Ax = platform.malloc(Ntotal*sizeof(dfloat)); - - localdots = (dfloat*) calloc(4, sizeof(dfloat)); - globaldots = (dfloat*) calloc(4, sizeof(dfloat)); + o_u = platform.malloc(Ntotal); + o_p = platform.malloc(Ntotal); + o_w = platform.malloc(Ntotal); + o_n = platform.malloc(Ntotal); + o_m = platform.malloc(Ntotal); + o_s = platform.malloc(Ntotal); + o_z = platform.malloc(Ntotal); + o_q = platform.malloc(Ntotal); + o_Ax = platform.malloc(Ntotal); //pinned tmp buffer for reductions - tmpdots = (dfloat*) platform.hostMalloc(4*NBFPCG_BLOCKSIZE*sizeof(dfloat), - NULL, h_tmpdots); - - o_tmpdots = platform.malloc(4*NBFPCG_BLOCKSIZE*sizeof(dfloat)); + dots = platform.hostMalloc(4*NBFPCG_BLOCKSIZE); 
+ o_dots = platform.malloc(4*NBFPCG_BLOCKSIZE); /* build kernels */ - occa::properties kernelInfo = platform.props; //copy base properties + properties_t kernelInfo = platform.props(); //copy base properties //add defines kernelInfo["defines/" "p_blockSize"] = (int)NBFPCG_BLOCKSIZE; @@ -67,13 +68,12 @@ nbfpcg::nbfpcg(dlong _N, dlong _Nhalo, "update1NBFPCG", kernelInfo); } -int nbfpcg::Solve(solver_t& solver, precon_t& precon, - occa::memory &o_x, occa::memory &o_r, +int nbfpcg::Solve(operator_t& linearOperator, operator_t& precon, + deviceMemory& o_x, deviceMemory& o_r, const dfloat tol, const int MAXIT, const int verbose) { - int rank; - MPI_Comm_rank(comm, &rank); - linAlg_t &linAlg = platform.linAlg; + int rank = comm.rank(); + linAlg_t &linAlg = platform.linAlg(); // register scalars dfloat alpha0 = 0; @@ -84,7 +84,7 @@ int nbfpcg::Solve(solver_t& solver, precon_t& precon, dfloat rdotr0 = 0; // compute A*x - solver.Operator(o_x, o_Ax); + linearOperator.Operator(o_x, o_Ax); // subtract r = r - A*x linAlg.axpy(N, -1.f, o_Ax, 1.f, o_r); @@ -96,7 +96,7 @@ int nbfpcg::Solve(solver_t& solver, precon_t& precon, o_p.copyFrom(o_u); // w = A*p - solver.Operator(o_p, o_w); + linearOperator.Operator(o_p, o_w); // gamma = u.r // delta = u.w @@ -104,19 +104,19 @@ int nbfpcg::Solve(solver_t& solver, precon_t& precon, precon.Operator(o_w, o_m); - solver.Operator(o_m, o_n); + linearOperator.Operator(o_m, o_n); o_s.copyFrom(o_w); o_q.copyFrom(o_m); o_z.copyFrom(o_n); - MPI_Wait(&request, &status); - gamma0 = globaldots[0]; // udotr - delta0 = globaldots[1]; // udotw - rdotr0 = globaldots[2]; // rdotr + comm.Wait(request); + gamma0 = dots[0]; // udotr + delta0 = dots[1]; // udotw + rdotr0 = dots[2]; // rdotr eta0 = delta0; alpha0 = gamma0/eta0; - dfloat TOL = mymax(tol*tol*rdotr0,tol*tol); + dfloat TOL = std::max(tol*tol*rdotr0,tol*tol); if (verbose&&(rank==0)) printf("NBFPCG: initial res norm %12.12f \n", sqrt(rdotr0)); @@ -147,14 +147,14 @@ int nbfpcg::Solve(solver_t& solver, 
precon_t& precon, linAlg.axpy(N, 1.0, o_u, 1.0, o_m); // n = A*m - solver.Operator(o_m, o_n); + linearOperator.Operator(o_m, o_n); // block for delta - MPI_Wait(&request, &status); - gamma0 = globaldots[0]; // u.r - beta0 = -globaldots[1]/eta0; // -u.s/eta - delta0 = globaldots[2]; // u.w - rdotr0 = globaldots[3]; // r.r + comm.Wait(request); + gamma0 = dots[0]; // u.r + beta0 = -dots[1]/eta0; // -u.s/eta + delta0 = dots[2]; // u.w + rdotr0 = dots[3]; // r.r // p <= u + beta*p linAlg.axpy(N, 1.0, o_u, beta0, o_p); @@ -187,64 +187,60 @@ int nbfpcg::Solve(solver_t& solver, precon_t& precon, return iter; } -void nbfpcg::Update0NBFPCG(occa::memory &o_r){ +void nbfpcg::Update0NBFPCG(deviceMemory& o_r){ // (u.r) // (u.w) // (r.r) int Nblocks = (N+NBFPCG_BLOCKSIZE-1)/NBFPCG_BLOCKSIZE; - Nblocks = (Nblocks>NBFPCG_BLOCKSIZE) ? NBFPCG_BLOCKSIZE : Nblocks; //limit to NBFPCG_BLOCKSIZE entries - - update0NBFPCGKernel(N, Nblocks, o_u, o_r, o_w, o_tmpdots); + Nblocks = std::min(Nblocks, NBFPCG_BLOCKSIZE); //limit to NBFPCG_BLOCKSIZE entries - o_tmpdots.copyTo(tmpdots, 3*Nblocks*sizeof(dfloat)); + update0NBFPCGKernel(N, Nblocks, o_u, o_r, o_w, o_dots); - localdots[0] = 0; - localdots[1] = 0; - localdots[2] = 0; - for(int n=0;n0) { + dots.copyFrom(o_dots, 3*Nblocks); + } else { + dots[0] = 0.0; + dots[1] = 0.0; + dots[2] = 0.0; } - globaldots[0] = 0; - globaldots[1] = 0; - globaldots[2] = 0; - MPI_Iallreduce(localdots, globaldots, 3, MPI_DFLOAT, MPI_SUM, comm, &request); + for(int n=1;n& o_x, deviceMemory& o_r){ // p <= z + beta*p // s <= Z + beta*s // dot(p,s) int Nblocks = (N+NBFPCG_BLOCKSIZE-1)/NBFPCG_BLOCKSIZE; - Nblocks = (Nblocks>NBFPCG_BLOCKSIZE) ? 
NBFPCG_BLOCKSIZE : Nblocks; //limit to NBFPCG_BLOCKSIZE entries - - update1NBFPCGKernel(N, Nblocks, o_p, o_s, o_q, o_z, alpha, o_x, o_r, o_u, o_w, o_tmpdots); + Nblocks = std::min(Nblocks,NBFPCG_BLOCKSIZE); //limit to NBFPCG_BLOCKSIZE entries - o_tmpdots.copyTo(tmpdots, 4*Nblocks*sizeof(dfloat)); + update1NBFPCGKernel(N, Nblocks, o_p, o_s, o_q, o_z, alpha, o_x, o_r, o_u, o_w, o_dots); - localdots[0] = 0; - localdots[1] = 0; - localdots[2] = 0; - localdots[3] = 0; - for(int n=0;n0) { + dots.copyFrom(o_dots, 4*Nblocks); + } else { + dots[0] = 0.0; + dots[1] = 0.0; + dots[2] = 0.0; + dots[3] = 0.0; } - globaldots[0] = 0; - globaldots[1] = 0; - globaldots[2] = 0; - globaldots[3] = 0; - MPI_Iallreduce(localdots, globaldots, 4, MPI_DFLOAT, MPI_SUM, comm, &request); + for(int n=1;n dummy(Ntotal, 0.0); - localdots = (dfloat*) calloc(3, sizeof(dfloat)); - globaldots = (dfloat*) calloc(3, sizeof(dfloat)); + /*aux variables */ + o_p = platform.malloc(Ntotal, dummy); + o_s = platform.malloc(Ntotal, dummy); + o_S = platform.malloc(Ntotal, dummy); + o_z = platform.malloc(Ntotal); + o_Z = platform.malloc(Ntotal); + o_Ax = platform.malloc(Ntotal); //pinned tmp buffer for reductions - tmpdots = (dfloat*) platform.hostMalloc(3*NBPCG_BLOCKSIZE*sizeof(dfloat), - NULL, h_tmpdots); - - o_tmpdots = platform.malloc(3*NBPCG_BLOCKSIZE*sizeof(dfloat)); + dots = platform.hostMalloc(3*NBPCG_BLOCKSIZE); + o_dots = platform.malloc(3*NBPCG_BLOCKSIZE); /* build kernels */ - occa::properties kernelInfo = platform.props; //copy base properties + properties_t kernelInfo = platform.props(); //copy base properties //add defines kernelInfo["defines/" "p_blockSize"] = (int)NBPCG_BLOCKSIZE; @@ -64,13 +67,12 @@ nbpcg::nbpcg(dlong _N, dlong _Nhalo, "update2NBPCG", kernelInfo); } -int nbpcg::Solve(solver_t& solver, precon_t& precon, - occa::memory &o_x, occa::memory &o_r, +int nbpcg::Solve(operator_t& linearOperator, operator_t& precon, + deviceMemory& o_x, deviceMemory& o_r, const dfloat tol, const int 
MAXIT, const int verbose) { - int rank; - MPI_Comm_rank(comm, &rank); - linAlg_t &linAlg = platform.linAlg; + int rank = comm.rank(); + linAlg_t &linAlg = platform.linAlg(); // register scalars dfloat zdotz0 = 0; @@ -84,7 +86,7 @@ int nbpcg::Solve(solver_t& solver, precon_t& precon, dfloat gamma1 = 0; // history gamma // compute A*x - solver.Operator(o_x, o_Ax); + linearOperator.Operator(o_x, o_Ax); // subtract r = r - A*x linAlg.axpy(N, -1.f, o_Ax, 1.f, o_r); @@ -97,14 +99,14 @@ int nbpcg::Solve(solver_t& solver, precon_t& precon, alpha0 = 0; Update2NBPCG(alpha0, o_r); - solver.Operator(o_z, o_Z); + linearOperator.Operator(o_z, o_Z); - MPI_Wait(&request, &status); - gamma0 = globaldots[0]; // rdotz - zdotz0 = globaldots[1]; - rdotr0 = globaldots[2]; + comm.Wait(request); + gamma0 = dots[0]; // rdotz + zdotz0 = dots[1]; + rdotr0 = dots[2]; - dfloat TOL = mymax(tol*tol*rdotr0,tol*tol); + dfloat TOL = std::max(tol*tol*rdotr0,tol*tol); if (verbose&&(rank==0)) printf("NBPCG: initial res norm %12.12f \n", sqrt(rdotr0)); @@ -125,8 +127,8 @@ int nbpcg::Solve(solver_t& solver, precon_t& precon, precon.Operator(o_s, o_S); // block for delta - MPI_Wait(&request, &status); - delta0 = globaldots[0]; + comm.Wait(request); + delta0 = dots[0]; // alpha = gamma/delta alpha0 = gamma0/delta0; @@ -142,14 +144,14 @@ int nbpcg::Solve(solver_t& solver, precon_t& precon, linAlg.axpy(N, alpha0, o_p, 1.0, o_x); // Z = A*z - solver.Operator(o_z, o_Z); + linearOperator.Operator(o_z, o_Z); // block for delta - MPI_Wait(&request, &status); + comm.Wait(request); gamma1 = gamma0; - gamma0 = globaldots[0]; // gamma = r.z - zdotz0 = globaldots[1]; // - rdotr0 = globaldots[2]; // + gamma0 = dots[0]; // gamma = r.z + zdotz0 = dots[1]; // + rdotr0 = dots[2]; // beta0 = gamma0/gamma1; @@ -171,21 +173,23 @@ void nbpcg::Update1NBPCG(const dfloat beta){ // s <= Z + beta*s // dot(p,s) int Nblocks = (N+NBPCG_BLOCKSIZE-1)/NBPCG_BLOCKSIZE; - Nblocks = (Nblocks>NBPCG_BLOCKSIZE) ? 
NBPCG_BLOCKSIZE : Nblocks; //limit to NBPCG_BLOCKSIZE entries + Nblocks = std::min(Nblocks,NBPCG_BLOCKSIZE); //limit to NBPCG_BLOCKSIZE entries - update1NBPCGKernel(N, Nblocks, o_z, o_Z, beta, o_p, o_s, o_tmpdots); + update1NBPCGKernel(N, Nblocks, o_z, o_Z, beta, o_p, o_s, o_dots); - o_tmpdots.copyTo(tmpdots, Nblocks*sizeof(dfloat)); + if (Nblocks>0) { + dots.copyFrom(o_dots, Nblocks); + } else { + dots[0] = 0.0; + } - localdots[0] = 0; - for(int n=0;n& o_r){ // r <= r - alpha*s // z <= z - alpha*S @@ -195,26 +199,24 @@ void nbpcg::Update2NBPCG(const dfloat alpha, occa::memory &o_r){ int Nblocks = (N+NBPCG_BLOCKSIZE-1)/NBPCG_BLOCKSIZE; Nblocks = (Nblocks>NBPCG_BLOCKSIZE) ? NBPCG_BLOCKSIZE : Nblocks; //limit to NBPCG_BLOCKSIZE entries - update2NBPCGKernel(N, Nblocks, o_s, o_S, alpha, o_r, o_z, o_tmpdots); - - o_tmpdots.copyTo(tmpdots, 3*Nblocks*sizeof(dfloat)); + update2NBPCGKernel(N, Nblocks, o_s, o_S, alpha, o_r, o_z, o_dots); - localdots[0] = 0; - localdots[1] = 0; - localdots[2] = 0; - for(int n=0;n0) { + dots.copyFrom(o_dots, 3*Nblocks); + } else { + dots[0] = 0.0; + dots[1] = 0.0; + dots[2] = 0.0; } - globaldots[0] = 0; - globaldots[1] = 0; - globaldots[2] = 0; - MPI_Iallreduce(localdots, globaldots, 3, MPI_DFLOAT, MPI_SUM, comm, &request); + for(int n=1;n dummy(Ntotal, 0.0); //need this to avoid uninitialized memory warnings + o_p = platform.malloc(dummy); + o_z = platform.malloc(dummy); + o_Ax = platform.malloc(dummy); + o_Ap = platform.malloc(dummy); //pinned tmp buffer for reductions - tmprdotr = (dfloat*) platform.hostMalloc(PCG_BLOCKSIZE*sizeof(dfloat), - NULL, h_tmprdotr); - o_tmprdotr = platform.malloc(PCG_BLOCKSIZE*sizeof(dfloat)); + rdotr = platform.hostMalloc(PCG_BLOCKSIZE); + o_rdotr = platform.malloc(PCG_BLOCKSIZE); /* build kernels */ - occa::properties kernelInfo = platform.props; //copy base properties + properties_t kernelInfo = platform.props(); //copy base properties //add defines kernelInfo["defines/" "p_blockSize"] = (int)PCG_BLOCKSIZE; @@ 
-60,49 +64,48 @@ pcg::pcg(dlong _N, dlong _Nhalo, "updatePCG", kernelInfo); } -int pcg::Solve(solver_t& solver, precon_t& precon, - occa::memory &o_x, occa::memory &o_r, +int pcg::Solve(operator_t& linearOperator, operator_t& precon, + deviceMemory& o_x, deviceMemory& o_r, const dfloat tol, const int MAXIT, const int verbose) { - int rank; - MPI_Comm_rank(comm, &rank); - linAlg_t &linAlg = platform.linAlg; + int rank = comm.rank(); + linAlg_t &linAlg = platform.linAlg(); // register scalars dfloat rdotz1 = 0.0; dfloat rdotz2 = 0.0; dfloat alpha = 0.0, beta = 0.0, pAp = 0.0; - dfloat rdotr = 0.0; + dfloat rdotr0 = 0.0; dfloat TOL = 0.0; // Comput norm of RHS (for stopping tolerance). if (settings.compareSetting("LINEAR SOLVER STOPPING CRITERION", "ABS/REL-RHS-2NORM")) { dfloat normb = linAlg.norm2(N, o_r, comm); - TOL = mymax(tol*tol*normb*normb, tol*tol); + TOL = std::max(tol*tol*normb*normb, tol*tol); } // compute A*x - solver.Operator(o_x, o_Ax); + linearOperator.Operator(o_x, o_Ax); // subtract r = r - A*x linAlg.axpy(N, -1.f, o_Ax, 1.f, o_r); - rdotr = linAlg.norm2(N, o_r, comm); - rdotr = rdotr*rdotr; + rdotr0 = linAlg.norm2(N, o_r, comm); + rdotr0 = rdotr0*rdotr0; if (settings.compareSetting("LINEAR SOLVER STOPPING CRITERION", "ABS/REL-INITRESID")) { - TOL = mymax(tol*tol*rdotr,tol*tol); + TOL = std::max(tol*tol*rdotr0,tol*tol); } if (verbose&&(rank==0)) - printf("PCG: initial res norm %12.12f \n", sqrt(rdotr)); + printf("PCG: initial res norm %12.12f \n", sqrt(rdotr0)); int iter; for(iter=0;iter 0) && (rdotr <= TOL))) { + if (((iter == 0) && (rdotr0 == 0.0)) || + ((iter > 0) && (rdotr0 <= TOL))) { break; } @@ -124,7 +127,7 @@ int pcg::Solve(solver_t& solver, precon_t& precon, linAlg.axpy(N, 1.f, o_z, beta, o_p); // A*p - solver.Operator(o_p, o_Ap); + linearOperator.Operator(o_p, o_Ap); // p.Ap pAp = linAlg.innerProd(N, o_p, o_Ap, comm); @@ -134,40 +137,39 @@ int pcg::Solve(solver_t& solver, precon_t& precon, // x <= x + alpha*p // r <= r - alpha*A*p // 
dot(r,r) - rdotr = UpdatePCG(alpha, o_x, o_r); + rdotr0 = UpdatePCG(alpha, o_x, o_r); if (verbose&&(rank==0)) { - if(rdotr<0) - printf("WARNING CG: rdotr = %17.15lf\n", rdotr); + if(rdotr0<0) + printf("WARNING CG: rdotr = %17.15lf\n", rdotr0); - printf("CG: it %d, r norm %12.12le, alpha = %le \n", iter+1, sqrt(rdotr), alpha); + printf("CG: it %d, r norm %12.12le, alpha = %le \n", iter+1, sqrt(rdotr0), alpha); } } return iter; } -dfloat pcg::UpdatePCG(const dfloat alpha, occa::memory &o_x, occa::memory &o_r){ +dfloat pcg::UpdatePCG(const dfloat alpha, deviceMemory& o_x, deviceMemory& o_r){ // x <= x + alpha*p // r <= r - alpha*A*p // dot(r,r) int Nblocks = (N+PCG_BLOCKSIZE-1)/PCG_BLOCKSIZE; - Nblocks = (Nblocks>PCG_BLOCKSIZE) ? PCG_BLOCKSIZE : Nblocks; //limit to PCG_BLOCKSIZE entries + Nblocks = std::min(Nblocks, PCG_BLOCKSIZE); //limit to PCG_BLOCKSIZE entries - updatePCGKernel(N, Nblocks, o_p, o_Ap, alpha, o_x, o_r, o_tmprdotr); + updatePCGKernel(N, Nblocks, o_p, o_Ap, alpha, o_x, o_r, o_rdotr); - o_tmprdotr.copyTo(tmprdotr, Nblocks*sizeof(dfloat)); + rdotr.copyFrom(o_rdotr, Nblocks); dfloat rdotr1 = 0; for(int n=0;n dummy(Ntotal, 0.0); //need this to avoid uninitialized memory warnings - o_V = new occa::memory[restart]; + o_V.malloc(restart); for(int i=0; i(dummy); } - H = (dfloat *) calloc((restart+1)*(restart+1), sizeof(dfloat)); - sn = (dfloat *) calloc(restart, sizeof(dfloat)); - cs = (dfloat *) calloc(restart, sizeof(dfloat)); - s = (dfloat *) calloc(restart+1, sizeof(dfloat)); - y = (dfloat *) calloc(restart, sizeof(dfloat)); + H .malloc((restart+1)*(restart+1), 0.0); + sn.malloc(restart); + cs.malloc(restart); + s.malloc(restart+1); + y.malloc(restart); /*aux variables */ - o_Ax = platform.malloc(Ntotal*sizeof(dfloat), dummy); - o_z = platform.malloc(Ntotal*sizeof(dfloat), dummy); - o_r = platform.malloc(Ntotal*sizeof(dfloat), dummy); - free(dummy); + o_Ax = platform.malloc(dummy); + o_z = platform.malloc(dummy); + o_r = platform.malloc(dummy); } -int 
pgmres::Solve(solver_t& solver, precon_t& precon, - occa::memory &o_x, occa::memory &o_b, +int pgmres::Solve(operator_t& linearOperator, operator_t& precon, + deviceMemory& o_x, deviceMemory& o_b, const dfloat tol, const int MAXIT, const int verbose) { - int rank; - MPI_Comm_rank(comm, &rank); - linAlg_t &linAlg = platform.linAlg; + int rank = comm.rank(); + linAlg_t &linAlg = platform.linAlg(); // compute A*x - solver.Operator(o_x, o_Ax); + linearOperator.Operator(o_x, o_Ax); // subtract z = b - A*x linAlg.zaxpy(N, -1.f, o_Ax, 1.f, o_b, o_z); @@ -83,7 +84,7 @@ int pgmres::Solve(solver_t& solver, precon_t& precon, dfloat nr = linAlg.norm2(N, o_r, comm); dfloat error = nr; - const dfloat TOL = mymax(tol*nr,tol); + const dfloat TOL = std::max(tol*nr,tol); if (verbose&&(rank==0)) printf("PGMRES: initial res norm %12.12f \n", nr); @@ -104,7 +105,7 @@ int pgmres::Solve(solver_t& solver, precon_t& precon, //Construct orthonormal basis via Gram-Schmidt for(int i=0;i& o_x, const int I){ for(int k=I-1; k>=0; --k){ y[k] = s[k]; @@ -201,16 +202,10 @@ void pgmres::UpdateGMRES(occa::memory& o_x, const int I){ //TODO this is really a GEMM, should write it that way for(int j=0; j dummy(Ntotal, 0.0); + o_p = platform.malloc(dummy); + o_z = platform.malloc(dummy); + o_r = platform.malloc(dummy); + o_r_old = platform.malloc(dummy); + o_q = platform.malloc(dummy); + o_q_old = platform.malloc(dummy); - occa::properties kernelInfo = platform.props; + properties_t kernelInfo = platform.props(); updateMINRESKernel = platform.buildKernel(LINEARSOLVER_DIR "/okl/linearSolverUpdateMINRES.okl", "updateMINRES", kernelInfo); - - return; -} - -pminres::~pminres() -{ - return; } -int pminres::Solve(solver_t& solver, precon_t& precon, - occa::memory &o_x, occa::memory &o_b, +int pminres::Solve(operator_t& linearOperator, operator_t& precon, + deviceMemory& o_x, deviceMemory& o_b, const dfloat tol, const int MAXIT, const int verbose) { - int rank, iter; + int iter; dfloat a0, a1, a2, a3, del, gam, 
gamp, c, cp, s, sp, eta; dfloat TOL; - MPI_Comm_rank(comm, &rank); - linAlg_t &linAlg = platform.linAlg; + int rank = comm.rank(); + linAlg_t &linAlg = platform.linAlg(); - solver.Operator(o_x, o_r); // r = b - A*x + linearOperator.Operator(o_x, o_r); // r = b - A*x linAlg.axpy(N, 1.0, o_b, -1.0, o_r); precon.Operator(o_r, o_z); // z = M\r gamp = 0.0; - gam = sqrt(innerProd(o_z, o_r)); // gam = sqrt(z . r); + gam = sqrt(linAlg.innerProd(N, o_z, o_r, comm)); // gam = sqrt(z . r); eta = gam; sp = 0.0; s = 0.0; cp = 1.0; c = 1.0; - TOL = mymax(tol*fabs(eta), tol); + TOL = std::max(tol*std::abs(eta), tol); if (verbose && (rank == 0)) { printf("PMINRES: initial eta = % .15e, target %.15e\n", eta, tol); } @@ -91,7 +85,7 @@ int pminres::Solve(solver_t& solver, precon_t& precon, printf("PMINRES: it %3d eta = % .15e, gamma = %.15e\n", iter, eta, gam); } - if ((fabs(eta) < TOL) && (iter >= 1)) { + if ((std::abs(eta) < TOL) && (iter >= 1)) { if (verbose && (rank == 0)) { printf("PMINRES converged in %d iterations (eta = % .15e).\n", iter, eta); } @@ -99,8 +93,8 @@ int pminres::Solve(solver_t& solver, precon_t& precon, } linAlg.scale(N, 1.0/gam, o_z); // z = z/gam - solver.Operator(o_z, o_p); // p = A*z - del = innerProd(o_p, o_z); // del = p . z + linearOperator.Operator(o_z, o_p); // p = A*z + del = linAlg.innerProd(N, o_p, o_z, comm); // del = p . z a0 = c*del - cp*s*gam; a2 = s*del + cp*c*gam; a3 = sp*gam; @@ -122,7 +116,7 @@ int pminres::Solve(solver_t& solver, precon_t& precon, #endif precon.Operator(o_r, o_z); // z = M\r gamp = gam; - gam = sqrt(innerProd(o_z, o_r)); // gam = sqrt(z . r) + gam = sqrt(linAlg.innerProd(N, o_z, o_r, comm)); // gam = sqrt(z . 
r) a1 = sqrt(a0*a0 + gam*gam); cp = c; c = a0/a1; @@ -138,12 +132,11 @@ int pminres::Solve(solver_t& solver, precon_t& precon, return iter; } -dfloat pminres::innerProd(occa::memory& o_x, occa::memory& o_y) -{ - return platform.linAlg.innerProd(N, o_x, o_y, comm); -} - void pminres::UpdateMINRES(const dfloat ma2, const dfloat ma3, const dfloat alpha, const dfloat beta) { updateMINRESKernel(N, ma2, ma3, alpha, beta, o_z, o_q_old, o_q, o_r_old, o_r, o_p); } + +} //namespace LinearSolver + +} //namespace libp diff --git a/libs/linearSolver/okl/igBasisInnerProducts.okl b/libs/linearSolver/okl/igBasisInnerProducts.okl index 64e9fb165..7b5d12647 100644 --- a/libs/linearSolver/okl/igBasisInnerProducts.okl +++ b/libs/linearSolver/okl/igBasisInnerProducts.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Anthony Austin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -35,7 +35,7 @@ SOFTWARE. { for (dlong b = 0; b < Nblocks; ++b; @outer(0)) { - @shared volatile dfloat s_wxy[p_blockSize]; + @shared dfloat s_wxy[p_blockSize]; // load x to register // block over igNhist @@ -52,7 +52,6 @@ SOFTWARE. for (int fld = 0; fld < dim; ++fld) { - @barrier("local"); for (int t = 0; t < p_blockSize; ++t; @inner(0)) { dlong id = t + p_blockSize*b; @@ -68,7 +67,6 @@ SOFTWARE. s_wxy[t] = res; } - @barrier("local"); #if p_blockSize>512 for(int t=0;t256 @@ -85,7 +82,6 @@ SOFTWARE. 
s_wxy[t] += s_wxy[t+256]; } - @barrier("local"); #endif for(int t=0;t512 for(int t=0;t256 @@ -71,7 +69,6 @@ s_dot[1][t] += s_dot[1][t+256]; s_dot[2][t] += s_dot[2][t+256]; } - @barrier("local"); #endif for(int t=0;t512 for(int t=0;t256 @@ -208,7 +196,6 @@ s_dot[2][t] += s_dot[2][t+256]; s_dot[3][t] += s_dot[3][t+256]; } - @barrier("local"); #endif for(int t=0;t512 for(int t=0;t256 for(int t=0;t512 for(int t=0;t256 @@ -155,7 +143,6 @@ s_dot[1][t] += s_dot[1][t+256]; s_dot[2][t] += s_dot[2][t+256]; } - @barrier("local"); #endif for(int t=0;t512 for(int t=0;t256 for(int t=0;tFree(); - if (ringHalo) ringHalo->Free(); - if (ogs) ogs->Free(); -} \ No newline at end of file diff --git a/libs/mesh/meshBasis1D.cpp b/libs/mesh/meshBasis1D.cpp index bdfabcc6a..f8b9374c2 100644 --- a/libs/mesh/meshBasis1D.cpp +++ b/libs/mesh/meshBasis1D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim WarburtonTim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,54 +26,66 @@ SOFTWARE. 
#include "mesh.hpp" +namespace libp { + // ------------------------------------------------------------------------ // 1D NODES // ------------------------------------------------------------------------ -void mesh_t::Nodes1D(int _N, dfloat *_r){ +void mesh_t::Nodes1D(const int _N, memory& _r){ JacobiGLL(_N, _r); //Gauss-Legendre-Lobatto nodes } -void mesh_t::EquispacedNodes1D(int _N, dfloat *_r){ +void mesh_t::EquispacedNodes1D(const int _N, memory& _r){ int _Nq = _N+1; dfloat dr = 2.0/_N; + + _r.malloc(_Nq); for (int i=0;i<_Nq;i++) _r[i] = -1.0 + i*dr; } // ------------------------------------------------------------------------ // ORTHONORMAL BASIS POLYNOMIALS // ------------------------------------------------------------------------ -void mesh_t::OrthonormalBasis1D(dfloat a, int i, dfloat *P){ - *P = JacobiP(a,0,0,i); //Legendre Polynomials +void mesh_t::OrthonormalBasis1D(const dfloat a, const int i, dfloat& P){ + P = JacobiP(a,0,0,i); //Legendre Polynomials } -void mesh_t::GradOrthonormalBasis1D(dfloat a, int i, dfloat *Pr){ - *Pr = GradJacobiP(a,0,0,i); +void mesh_t::GradOrthonormalBasis1D(const dfloat a, const int i, dfloat& Pr){ + Pr = GradJacobiP(a,0,0,i); } // ------------------------------------------------------------------------ // 1D VANDERMONDE MATRICES // ------------------------------------------------------------------------ -void mesh_t::Vandermonde1D(int _N, int Npoints, dfloat *_r, dfloat *V){ +void mesh_t::Vandermonde1D(const int _N, + const memory _r, + memory& V){ - int _Np = (_N+1); + const int _Np = (_N+1); + const int Npoints = _r.length(); + V.malloc(Npoints*_Np); for(int n=0; n _r, + memory& Vr){ - int _Np = (_N+1); + const int _Np = (_N+1); + const int Npoints = _r.length(); + Vr.malloc(Npoints*_Np); for(int n=0; n V, + memory& _MM){ - // masMatrix = inv(V')*inv(V) = inv(V*V') + // massMatrix = inv(V')*inv(V) = inv(V*V') + _MM.malloc(_Np*_Np); for(int n=0;n<_Np;++n){ for(int m=0;m<_Np;++m){ dfloat res = 0; @@ -93,71 +108,74 @@ void 
mesh_t::MassMatrix1D(int _Np, dfloat *V, dfloat *_MM){ _MM[n*_Np + m] = res; } } - matrixInverse(_Np, _MM); + linAlg_t::matrixInverse(_Np, _MM); } -void mesh_t::Dmatrix1D(int _N, int NpointsIn, dfloat *_rIn, - int NpointsOut, dfloat *_rOut, dfloat *_Dr){ +void mesh_t::Dmatrix1D(const int _N, + const memory _rIn, + const memory _rOut, + memory& _Dr){ - // need NpointsIn = (_N+1) - if (NpointsIn != _N+1) - LIBP_ABORT(string("Invalid Differentiation operator requested.")) - int _Np = _N+1; + const int _Np = _N+1; + const int NpointsIn = _rIn.length(); + const int NpointsOut = _rOut.length(); - dfloat *V = (dfloat *) calloc(NpointsIn*_Np, sizeof(dfloat)); - dfloat *Vr = (dfloat *) calloc(NpointsOut*_Np, sizeof(dfloat)); + // need NpointsIn = (_N+1) + LIBP_ABORT("Invalid Differentiation operator requested.", + NpointsIn != _N+1); - Vandermonde1D(_N, NpointsIn, _rIn, V); - GradVandermonde1D(_N, NpointsOut, _rOut, Vr); + memory V; + memory Vr; + Vandermonde1D(_N, _rIn, V); + GradVandermonde1D(_N, _rOut, Vr); //D = Vr/V - matrixRightSolve(NpointsOut, _Np, Vr, _Np, _Np, V, _Dr); - - free(V); - free(Vr); + _Dr.malloc(NpointsOut*_Np); + linAlg_t::matrixRightSolve(NpointsOut, _Np, Vr, _Np, _Np, V, _Dr); } -void mesh_t::InterpolationMatrix1D(int _N, - int NpointsIn, dfloat *rIn, - int NpointsOut, dfloat *rOut, - dfloat *I){ +void mesh_t::InterpolationMatrix1D(const int _N, + const memory _rIn, + const memory _rOut, + memory& I){ - // need NpointsIn = (_N+1) - if (NpointsIn != _N+1) - LIBP_ABORT(string("Invalid Interplation operator requested.")) + const int _Np = _N+1; + const int NpointsIn = _rIn.length(); + const int NpointsOut = _rOut.length(); - dfloat *VIn = (dfloat*) malloc(NpointsIn*(_N+1)*sizeof(dfloat)); - dfloat *VOut= (dfloat*) malloc(NpointsOut*(_N+1)*sizeof(dfloat)); - - Vandermonde1D(_N, NpointsIn, rIn, VIn); - Vandermonde1D(_N, NpointsOut, rOut, VOut); + // need NpointsIn = (_N+1) + LIBP_ABORT("Invalid Interplation operator requested.", + NpointsIn != _N+1); - 
matrixRightSolve(NpointsOut, _N+1, VOut, NpointsIn, _N+1, VIn, I); + memory VIn; + memory VOut; + Vandermonde1D(_N, _rIn, VIn); + Vandermonde1D(_N, _rOut, VOut); - free(VIn); free(VOut); + I.malloc(NpointsIn*NpointsOut); + linAlg_t::matrixRightSolve(NpointsOut, _Np, VOut, + NpointsIn, _Np, VIn, I); } -void mesh_t::DegreeRaiseMatrix1D(int Nc, int Nf, dfloat *P){ - - int Nqc = Nc+1; - int Nqf = Nf+1; - - dfloat *rc = (dfloat *) malloc(Nqc*sizeof(dfloat)); - dfloat *rf = (dfloat *) malloc(Nqf*sizeof(dfloat)); +void mesh_t::DegreeRaiseMatrix1D(const int Nc, const int Nf, + memory& P){ + memory rc; + memory rf; Nodes1D(Nc, rc); Nodes1D(Nf, rf); - InterpolationMatrix1D(Nc, Nqc, rc, Nqf, rf, P); - - free(rc); free(rf); + InterpolationMatrix1D(Nc, rc, rf, P); } -void mesh_t::CubatureWeakDmatrix1D(int _Nq, int _cubNq, - dfloat *_cubProject, dfloat *_cubD, dfloat *_cubPDT){ +void mesh_t::CubatureWeakDmatrix1D(const int _Nq, const int _cubNq, + const memory _cubProject, + const memory _cubD, + memory& _cubPDT){ // cubPDT = cubProject*cubD'; + _cubPDT.malloc(_Nq*_cubNq); for(int n=0;n<_Nq;++n){ for(int m=0;m<_cubNq;++m){ _cubPDT[n*_cubNq+m] = 0.0; @@ -171,29 +189,30 @@ void mesh_t::CubatureWeakDmatrix1D(int _Nq, int _cubNq, // ------------------------------------------------------------------------ // 1D JACOBI POLYNOMIALS // ------------------------------------------------------------------------ -static dfloat mygamma(dfloat x){ +static dfloat mygamma(const dfloat x){ dfloat lgam = lgamma(x); dfloat gam = signgam*exp(lgam); return gam; } -dfloat mesh_t::JacobiP(dfloat a, dfloat alpha, dfloat beta, int _N){ +dfloat mesh_t::JacobiP(const dfloat a, const dfloat alpha, + const dfloat beta, const int _N){ - dfloat ax = a; + const dfloat ax = a; - dfloat *P = (dfloat *) calloc((_N+1), sizeof(dfloat)); + memory P(_N+1); // Zero order - dfloat gamma0 = pow(2,(alpha+beta+1))/(alpha+beta+1)*mygamma(1+alpha)*mygamma(1+beta)/mygamma(1+alpha+beta); - dfloat p0 = 1.0/sqrt(gamma0); + const 
dfloat gamma0 = pow(2,(alpha+beta+1))/(alpha+beta+1)*mygamma(1+alpha)*mygamma(1+beta)/mygamma(1+alpha+beta); + const dfloat p0 = 1.0/sqrt(gamma0); - if (_N==0){ free(P); return p0;} + if (_N==0){ return p0;} P[0] = p0; // first order - dfloat gamma1 = (alpha+1)*(beta+1)/(alpha+beta+3)*gamma0; - dfloat p1 = ((alpha+beta+2)*ax/2 + (alpha-beta)/2)/sqrt(gamma1); - if (_N==1){free(P); return p1;} + const dfloat gamma1 = (alpha+1)*(beta+1)/(alpha+beta+3)*gamma0; + const dfloat p1 = ((alpha+beta+2)*ax/2 + (alpha-beta)/2)/sqrt(gamma1); + if (_N==1){ return p1;} P[1] = p1; @@ -207,13 +226,11 @@ dfloat mesh_t::JacobiP(dfloat a, dfloat alpha, dfloat beta, int _N){ P[i+1] = 1./anew*( -aold*P[i-1] + (ax-bnew)*P[i]); aold =anew; } - - dfloat pN = P[_N]; - free(P); - return pN; + return P[_N]; } -dfloat mesh_t::GradJacobiP(dfloat a, dfloat alpha, dfloat beta, int _N){ +dfloat mesh_t::GradJacobiP(const dfloat a, const dfloat alpha, + const dfloat beta, const int _N){ dfloat PNr = 0; @@ -226,53 +243,74 @@ dfloat mesh_t::GradJacobiP(dfloat a, dfloat alpha, dfloat beta, int _N){ // ------------------------------------------------------------------------ // 1D GAUSS-LEGENDRE-LOBATTO QUADRATURE // ------------------------------------------------------------------------ -void mesh_t::JacobiGLL(int _N, dfloat *_x, dfloat *_w){ +void mesh_t::JacobiGLL(const int _N, memory& _x){ + + _x.malloc(_N+1); _x[0] = -1.; _x[_N] = 1.; if(_N>1){ - dfloat *wtmp = (dfloat*) calloc(_N-1, sizeof(dfloat)); - JacobiGQ(1,1, _N-2, _x+1, wtmp); - free(wtmp); + memory wtmp; + memory xp1 = _x + 1; + JacobiGQ(1,1, _N-2, xp1, wtmp); } +} - if (_w!=NULL) { - int _Np = _N+1; - dfloat *_MM = (dfloat*) malloc(_Np*_Np*sizeof(dfloat)); - dfloat *V = (dfloat*) malloc(_Np*_Np*sizeof(dfloat)); +void mesh_t::JacobiGLL(const int _N, + memory& _x, + memory& _w){ - Vandermonde1D(_N, _N+1, _x, V); - MassMatrix1D(_N+1, V, _MM); + _x.malloc(_N+1); + _w.malloc(_N+1); - // use weights from mass lumping - for(int n=0;n<=_N;++n){ - 
dfloat res = 0; - for(int m=0;m<=_N;++m){ - res += _MM[n*(_N+1)+m]; - } - _w[n] = res; + _x[0] = -1.; + _x[_N] = 1.; + + if(_N>1){ + memory wtmp; + memory xp1 = _x + 1; + JacobiGQ(1,1, _N-2, xp1, wtmp); + } + + memory V; + memory _MM; + Vandermonde1D(_N, _x, V); + MassMatrix1D(_N+1, V, _MM); + + // use weights from mass lumping + for(int n=0;n<=_N;++n){ + dfloat res = 0; + for(int m=0;m<=_N;++m){ + res += _MM[n*(_N+1)+m]; } + _w[n] = res; } } // ------------------------------------------------------------------------ // 1D GAUSS QUADRATURE // ------------------------------------------------------------------------ -void mesh_t::JacobiGQ(dfloat alpha, dfloat beta, int _N, dfloat *_x, dfloat *_w){ +void mesh_t::JacobiGQ(const dfloat alpha, const dfloat beta, + const int _N, + memory& _x, + memory& _w){ // function NGQ = JacobiGQ(alpha,beta,_N, _x, _w) // Purpose: Compute the _N'th order Gauss quadrature points, _x, // and weights, _w, associated with the Jacobi // polynomial, of type (alpha,beta) > -1 ( <> -0.5). + if (_x.length()==0) _x.malloc(_N+1); + if (_w.length()==0) _w.malloc(_N+1); + if (_N==0){ _x[0] = (alpha-beta)/(alpha+beta+2); _w[0] = 2; } // Form symmetric matrix from recurrence. 
- dfloat *J = (dfloat*) calloc((_N+1)*(_N+1), sizeof(dfloat)); - dfloat *h1 = (dfloat*) calloc(_N+1, sizeof(dfloat)); + memory J((_N+1)*(_N+1), 0.0); + memory h1(_N+1); for(int n=0;n<=_N;++n){ h1[n] = 2*n+alpha+beta; @@ -301,12 +339,11 @@ void mesh_t::JacobiGQ(dfloat alpha, dfloat beta, int _N, dfloat *_x, dfloat *_w) // Compute quadrature by eigenvalue solve // [V,D] = eig(J); - dfloat *WR = (dfloat*) calloc(_N+1, sizeof(dfloat)); - dfloat *WI = (dfloat*) calloc(_N+1, sizeof(dfloat)); - dfloat *VR = (dfloat*) calloc((_N+1)*(_N+1), sizeof(dfloat)); + memory WI(_N+1); + memory VR((_N+1)*(_N+1)); // _x = diag(D); - matrixEigenVectors(_N+1, J, VR, _x, WI); + linAlg_t::matrixEigenVectors(_N+1, J, VR, _x, WI); //_w = (V(1,:)').^2*2^(alpha+beta+1)/(alpha+beta+1)*gamma(alpha+1)*.gamma(beta+1)/gamma(alpha+beta+1); for(int n=0;n<=_N;++n){ @@ -332,10 +369,6 @@ void mesh_t::JacobiGQ(dfloat alpha, dfloat beta, int _N, dfloat *_x, dfloat *_w) printf("zgl[%d] = % e, wgl[%d] = % e\n", n, _x[0][n], n, _w[0][n]); } #endif - - free(WR); - free(WI); - free(VR); } /* @@ -483,3 +516,5 @@ void meshCubatureWeakDmatrices1D(int _N, int _Np, dfloat *V, free(cubVr); } */ + +} //namespace libp diff --git a/libs/mesh/meshBasisHex3D.cpp b/libs/mesh/meshBasisHex3D.cpp index 696ecf6e2..361e9f8eb 100644 --- a/libs/mesh/meshBasisHex3D.cpp +++ b/libs/mesh/meshBasisHex3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim WarburtonTim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,17 +25,26 @@ SOFTWARE. 
*/ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" + +namespace libp { // ------------------------------------------------------------------------ // HEX 3D NODES // ------------------------------------------------------------------------ -void mesh_t::NodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t){ - int _Nq = _N+1; - - dfloat *r1D = (dfloat*) malloc(_Nq*sizeof(dfloat)); +void mesh_t::NodesHex3D(const int _N, + memory& _r, + memory& _s, + memory& _t){ + const int _Nq = _N+1; + const int _Np = _Nq*_Nq*_Nq; + + memory r1D; JacobiGLL(_N, r1D); //Gauss-Legendre-Lobatto nodes + _r.malloc(_Np); + _s.malloc(_Np); + _t.malloc(_Np); + //Tensor product for (int k=0;k<_Nq;k++) { for (int j=0;j<_Nq;j++) { @@ -46,11 +55,13 @@ void mesh_t::NodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t){ } } } - - free(r1D); } -void mesh_t::FaceNodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_faceNodes){ +void mesh_t::FaceNodesHex3D(const int _N, + const memory _r, + const memory _s, + const memory _t, + memory& _faceNodes){ int _Nq = _N+1; int _Nfp = _Nq*_Nq; int _Np = _Nq*_Nq*_Nq; @@ -64,25 +75,30 @@ void mesh_t::FaceNodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_fa const dfloat NODETOL = 1000.*deps; + _faceNodes.malloc(6*_Nfp); for (int n=0;n<_Np;n++) { - if(fabs(_t[n]+1) _r, + const memory _s, + const memory _t, + memory& _vertexNodes){ + const int _Nq = _N+1; + const int _Np = _Nq*_Nq*_Nq; dfloat deps = 1.; while((1.+deps)>1.) 
@@ -90,6 +106,7 @@ void mesh_t::VertexNodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_ const dfloat NODETOL = 1000.*deps; + _vertexNodes.malloc(8); for(int n=0;n<_Np;++n){ if( (_r[n]+1)*(_r[n]+1)+(_s[n]+1)*(_s[n]+1)+(_t[n]+1)*(_t[n]+1) _r, + const memory _s, + const memory _t, + const memory _faceNodes, + const memory _faceVertices, + memory& R){ + + const int _Nfaces = 6; + const int _Nverts = 8; + const int _NfaceVertices = 4; + + const int _Nfp = _faceNodes.length()/_Nfaces; + + const dfloat NODETOL = 1.0e-5; + + dfloat V0[4][2] = {{-1.0,-1.0},{ 1.0,-1.0},{ 1.0, 1.0},{-1.0, 1.0}}; + dfloat V1[4][2] = {{-1.0,-1.0},{-1.0, 1.0},{ 1.0, 1.0},{ 1.0,-1.0}}; + + dfloat EX0[_Nverts], EY0[_Nverts]; + dfloat EX1[_Nverts], EY1[_Nverts]; + + memory x0(_Nfp); + memory y0(_Nfp); + + memory x1(_Nfp); + memory y1(_Nfp); + + R.malloc(_Nfaces*_Nfaces*_NfaceVertices*_Nfp); + + for (int fM=0;fM<_Nfaces;fM++) { + + for (int v=0;v<_Nverts;v++) { + EX0[v] = 0.0; EY0[v] = 0.0; + } + //setup top element with face fM on the bottom + for (int v=0;v<_NfaceVertices;v++) { + int fv = _faceVertices[fM*_NfaceVertices + v]; + EX0[fv] = V0[v][0]; EY0[fv] = V0[v][1]; + } + + for(int n=0;n<_Nfp;++n){ /* for each face node */ + const int fn = _faceNodes[fM*_Nfp+n]; + + /* (r,s,t) coordinates of interpolation nodes*/ + dfloat rn = _r[fn]; + dfloat sn = _s[fn]; + dfloat tn = _t[fn]; + + /* physical coordinate of interpolation node */ + x0[n] = + +0.125*(1-rn)*(1-sn)*(1-tn)*EX0[0] + +0.125*(1+rn)*(1-sn)*(1-tn)*EX0[1] + +0.125*(1+rn)*(1+sn)*(1-tn)*EX0[2] + +0.125*(1-rn)*(1+sn)*(1-tn)*EX0[3] + +0.125*(1-rn)*(1-sn)*(1+tn)*EX0[4] + +0.125*(1+rn)*(1-sn)*(1+tn)*EX0[5] + +0.125*(1+rn)*(1+sn)*(1+tn)*EX0[6] + +0.125*(1-rn)*(1+sn)*(1+tn)*EX0[7]; + + y0[n] = + +0.125*(1-rn)*(1-sn)*(1-tn)*EY0[0] + +0.125*(1+rn)*(1-sn)*(1-tn)*EY0[1] + +0.125*(1+rn)*(1+sn)*(1-tn)*EY0[2] + +0.125*(1-rn)*(1+sn)*(1-tn)*EY0[3] + +0.125*(1-rn)*(1-sn)*(1+tn)*EY0[4] + +0.125*(1+rn)*(1-sn)*(1+tn)*EY0[5] + 
+0.125*(1+rn)*(1+sn)*(1+tn)*EY0[6] + +0.125*(1-rn)*(1+sn)*(1+tn)*EY0[7]; + } + + for (int fP=0;fP<_Nfaces;fP++) { /*For each neighbor face */ + for (int rot=0;rot<_NfaceVertices;rot++) { /* For each face rotation */ + // Zero vertices + for (int v=0;v<_Nverts;v++) { + EX1[v] = 0.0; EY1[v] = 0.0; + } + //setup bottom element with face fP on the top + for (int v=0;v<_NfaceVertices;v++) { + int fv = _faceVertices[fP*_NfaceVertices + ((v+rot)%_NfaceVertices)]; + EX1[fv] = V1[v][0]; EY1[fv] = V1[v][1]; + } + + for(int n=0;n<_Nfp;++n){ /* for each node */ + const int fn = _faceNodes[fP*_Nfp+n]; + + /* (r,s,t) coordinates of interpolation nodes*/ + dfloat rn = _r[fn]; + dfloat sn = _s[fn]; + dfloat tn = _t[fn]; + + /* physical coordinate of interpolation node */ + x1[n] = 0.125*(1-rn)*(1-sn)*(1-tn)*EX1[0] + +0.125*(1+rn)*(1-sn)*(1-tn)*EX1[1] + +0.125*(1+rn)*(1+sn)*(1-tn)*EX1[2] + +0.125*(1-rn)*(1+sn)*(1-tn)*EX1[3] + +0.125*(1-rn)*(1-sn)*(1+tn)*EX1[4] + +0.125*(1+rn)*(1-sn)*(1+tn)*EX1[5] + +0.125*(1+rn)*(1+sn)*(1+tn)*EX1[6] + +0.125*(1-rn)*(1+sn)*(1+tn)*EX1[7]; + + y1[n] = 0.125*(1-rn)*(1-sn)*(1-tn)*EY1[0] + +0.125*(1+rn)*(1-sn)*(1-tn)*EY1[1] + +0.125*(1+rn)*(1+sn)*(1-tn)*EY1[2] + +0.125*(1-rn)*(1+sn)*(1-tn)*EY1[3] + +0.125*(1-rn)*(1-sn)*(1+tn)*EY1[4] + +0.125*(1+rn)*(1-sn)*(1+tn)*EY1[5] + +0.125*(1+rn)*(1+sn)*(1+tn)*EY1[6] + +0.125*(1-rn)*(1+sn)*(1+tn)*EY1[7]; + } + + /* for each node on this face find the neighbor node */ + for(int n=0;n<_Nfp;++n){ + const dfloat xM = x0[n]; + const dfloat yM = y0[n]; + + int m=0; + for(;m<_Nfp;++m){ /* for each neighbor node */ + const dfloat xP = x1[m]; + const dfloat yP = y1[m]; + + /* distance between target and neighbor node */ + const dfloat dist = pow(xM-xP,2) + pow(yM-yP,2); + + /* if neighbor node is close to target, match */ + if(distNODETOL); + } + } + } + } +} + +void mesh_t::EquispacedNodesHex3D(const int _N, + memory& _r, + memory& _s, + memory& _t){ + const int _Nq = _N+1; + const int _Np = _Nq*_Nq*_Nq; //Equispaced 1D 
nodes - dfloat *r1D = (dfloat*) malloc(_Nq*sizeof(dfloat)); - dfloat dr = 2.0/_N; - for (int i=0;i<_Nq;i++) r1D[i] = -1.0 + i*dr; + memory r1D; + EquispacedNodes1D(_N, r1D); //Tensor product + _r.malloc(_Np); + _s.malloc(_Np); + _t.malloc(_Np); for (int k=0;k<_Nq;k++) { for (int j=0;j<_Nq;j++) { for (int i=0;i<_Nq;i++) { @@ -128,13 +303,14 @@ void mesh_t::EquispacedNodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t){ } } } - - free(r1D); } -void mesh_t::EquispacedEToVHex3D(int _N, int *_EToV){ - int _Nq = _N+1; - int _Nverts = 4; +void mesh_t::EquispacedEToVHex3D(const int _N, memory& _EToV){ + const int _Nq = _N+1; + const int _Nelements = 6*_N*_N*_N; + const int _Nverts = 4; + + _EToV.malloc(_Nelements*_Nverts); //Tensor product int cnt=0; @@ -187,9 +363,12 @@ void mesh_t::EquispacedEToVHex3D(int _N, int *_EToV){ } } -void mesh_t::SEMFEMEToVHex3D(int _N, int *_EToV){ - int _Nq = _N+1; - int _Nverts = 8; +void mesh_t::SEMFEMEToVHex3D(const int _N, memory& _EToV){ + const int _Nq = _N+1; + const int _Nelements = _N*_N*_N; + const int _Nverts = 8; + + _EToV.malloc(_Nelements*_Nverts); //Tensor product int cnt=0; @@ -213,48 +392,68 @@ void mesh_t::SEMFEMEToVHex3D(int _N, int *_EToV){ // ------------------------------------------------------------------------ // ORTHONORMAL BASIS POLYNOMIALS // ------------------------------------------------------------------------ -void mesh_t::OrthonormalBasisHex3D(dfloat a, dfloat b, dfloat c, int i, int j, int k, dfloat *P){ - *P = JacobiP(a,0,0,i)*JacobiP(b,0,0,j)*JacobiP(c,0,0,k); +void mesh_t::OrthonormalBasisHex3D(const dfloat a, const dfloat b, const dfloat c, + const int i, const int j, const int k, + dfloat& P){ + P = JacobiP(a,0,0,i)*JacobiP(b,0,0,j)*JacobiP(c,0,0,k); } -void mesh_t::GradOrthonormalBasisHex3D(dfloat a, dfloat b, dfloat c, int i, int j, int k, dfloat *Pr, dfloat *Ps, dfloat *Pt){ - *Pr = GradJacobiP(a,0,0,i)*JacobiP(b,0,0,j)*JacobiP(c,0,0,k); - *Ps = 
JacobiP(a,0,0,i)*GradJacobiP(b,0,0,j)*JacobiP(c,0,0,k); - *Pt = JacobiP(a,0,0,i)*JacobiP(b,0,0,j)*GradJacobiP(c,0,0,k); +void mesh_t::GradOrthonormalBasisHex3D(const dfloat a, const dfloat b, const dfloat c, + const int i, const int j, const int k, + dfloat& Pr, dfloat& Ps, dfloat& Pt){ + Pr = GradJacobiP(a,0,0,i)*JacobiP(b,0,0,j)*JacobiP(c,0,0,k); + Ps = JacobiP(a,0,0,i)*GradJacobiP(b,0,0,j)*JacobiP(c,0,0,k); + Pt = JacobiP(a,0,0,i)*JacobiP(b,0,0,j)*GradJacobiP(c,0,0,k); } // ------------------------------------------------------------------------ // 2D VANDERMONDE MATRICES // ------------------------------------------------------------------------ -void mesh_t::VandermondeHex3D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloat *_t, dfloat *V){ +void mesh_t::VandermondeHex3D(const int _N, + const memory _r, + const memory _s, + const memory _t, + memory& V){ - int _Nq = _N+1; - int _Np = _Nq*_Nq*_Nq; + const int _Nq = _N+1; + const int _Np = _Nq*_Nq*_Nq; + const int Npoints = _r.length(); + V.malloc(Npoints*_Np); for(int n=0; n _r, + const memory _s, + const memory _t, + memory& Vr, + memory& Vs, + memory& Vt){ + + const int _Nq = _N+1; + const int _Np = _Nq*_Nq*_Nq; + const int Npoints = _r.length(); + + Vr.malloc(Npoints*_Np); + Vs.malloc(Npoints*_Np); + Vt.malloc(Npoints*_Np); for(int n=0; n V, + memory& _MM){ - // masMatrix = inv(V')*inv(V) = inv(V*V') + // massMatrix = inv(V')*inv(V) = inv(V*V') + _MM.malloc(_Np*_Np); for(int n=0;n<_Np;++n){ for(int m=0;m<_Np;++m){ dfloat res = 0; @@ -276,15 +478,18 @@ void mesh_t::MassMatrixHex3D(int _Np, dfloat *V, dfloat *_MM){ _MM[n*_Np + m] = res; } } - matrixInverse(_Np, _MM); + linAlg_t::matrixInverse(_Np, _MM); } -void mesh_t::LumpedMassMatrixHex3D(int _N, dfloat *_gllw, dfloat *_MM){ +void mesh_t::LumpedMassMatrixHex3D(const int _N, + const memory _gllw, + memory& _MM){ - int _Nq = _N+1; - int _Np = _Nq*_Nq*_Nq; + const int _Nq = _N+1; + const int _Np = _Nq*_Nq*_Nq; // LumpedMassMatrix = gllw \ctimes gllw \ctimes 
gllw + _MM.malloc(_Np*_Np, 0.0); for(int k=0;k<_Nq;++k){ for(int n=0;n<_Nq;++n){ for(int m=0;m<_Nq;++m){ @@ -295,12 +500,15 @@ void mesh_t::LumpedMassMatrixHex3D(int _N, dfloat *_gllw, dfloat *_MM){ } } -void mesh_t::invLumpedMassMatrixHex3D(int _N, dfloat *_gllw, dfloat *_invMM){ +void mesh_t::invLumpedMassMatrixHex3D(const int _N, + const memory _gllw, + memory& _invMM){ - int _Nq = _N+1; - int _Np = _Nq*_Nq*_Nq; + const int _Nq = _N+1; + const int _Np = _Nq*_Nq*_Nq; // invLumpedMassMatrix = invgllw \ctimes invgllw + _invMM.malloc(_Np*_Np, 0.0); for(int k=0;k<_Nq;++k){ for(int n=0;n<_Nq;++n){ for(int m=0;m<_Nq;++m){ @@ -311,47 +519,56 @@ void mesh_t::invLumpedMassMatrixHex3D(int _N, dfloat *_gllw, dfloat *_invMM){ } } -void mesh_t::DmatrixHex3D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloat *_t, - dfloat *_Dr, dfloat *_Ds, dfloat *_Dt){ - - int _Nq = _N+1; - int _Np = _Nq*_Nq*_Nq; +void mesh_t::DmatrixHex3D(const int _N, + const memory _r, + const memory _s, + const memory _t, + memory& _D){ - dfloat *V = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat)); - dfloat *Vr = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat)); - dfloat *Vs = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat)); - dfloat *Vt = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat)); + const int _Nq = _N+1; + const int _Np = _Nq*_Nq*_Nq; - VandermondeHex3D(_N, Npoints, _r, _s, _t, V); - GradVandermondeHex3D(_N, Npoints, _r, _s, _t, Vr, Vs, Vt); + memory V, Vr, Vs, Vt; + VandermondeHex3D(_N, _r, _s, _t, V); + GradVandermondeHex3D(_N, _r, _s, _t, Vr, Vs, Vt); //Dr = Vr/V, Ds = Vs/V, Dt = Vt/V - matrixRightSolve(_Np, _Np, Vr, _Np, _Np, V, _Dr); - matrixRightSolve(_Np, _Np, Vs, _Np, _Np, V, _Ds); - matrixRightSolve(_Np, _Np, Vt, _Np, _Np, V, _Dt); - - free(V); free(Vr); free(Vs); free(Vt); + _D.malloc(3*_Np*_Np); + memory _Dr = _D + 0*_Np*_Np; + memory _Ds = _D + 1*_Np*_Np; + memory _Dt = _D + 2*_Np*_Np; + linAlg_t::matrixRightSolve(_Np, _Np, Vr, _Np, _Np, V, _Dr); + linAlg_t::matrixRightSolve(_Np, _Np, Vs, 
_Np, _Np, V, _Ds); + linAlg_t::matrixRightSolve(_Np, _Np, Vt, _Np, _Np, V, _Dt); } -void mesh_t::InterpolationMatrixHex3D(int _N, - int NpointsIn, dfloat *rIn, dfloat *sIn, dfloat *tIn, - int NpointsOut, dfloat *rOut, dfloat *sOut, dfloat *tOut, - dfloat *I){ +void mesh_t::InterpolationMatrixHex3D(const int _N, + const memory rIn, + const memory sIn, + const memory tIn, + const memory rOut, + const memory sOut, + const memory tOut, + memory& I){ - int _Nq = _N+1; - int _Np = _Nq*_Nq*_Nq; + const int _Nq = _N+1; + const int _Np = _Nq*_Nq*_Nq; - // need NpointsIn = _Np - if (NpointsIn != _Np) - LIBP_ABORT(string("Invalid Interplation operator requested.")) + const int NpointsIn = rIn.length(); + const int NpointsOut = rOut.length(); - dfloat *VIn = (dfloat*) malloc(NpointsIn*_Np*sizeof(dfloat)); - dfloat *VOut= (dfloat*) malloc(NpointsOut*_Np*sizeof(dfloat)); + // need NpointsIn = _Np + LIBP_ABORT("Invalid Interplation operator requested.", + NpointsIn != _Np); - VandermondeHex3D(_N, NpointsIn, rIn, sIn, tIn, VIn); - VandermondeHex3D(_N, NpointsOut, rOut, sOut, tOut, VOut); + memory VIn; + memory VOut; + VandermondeHex3D(_N, rIn, sIn, tIn, VIn); + VandermondeHex3D(_N, rOut, sOut, tOut, VOut); - matrixRightSolve(NpointsOut, _Np, VOut, NpointsIn, _Np, VIn, I); + I.malloc(NpointsIn*NpointsOut); + linAlg_t::matrixRightSolve(NpointsOut, _Np, VOut, + NpointsIn, _Np, VIn, I); +} - free(VIn); free(VOut); -} \ No newline at end of file +} //namespace libp diff --git a/libs/mesh/meshBasisQuad2D.cpp b/libs/mesh/meshBasisQuad2D.cpp index bba5f10ec..932fd1460 100644 --- a/libs/mesh/meshBasisQuad2D.cpp +++ b/libs/mesh/meshBasisQuad2D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim WarburtonTim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the 
"Software"), to deal @@ -25,17 +25,24 @@ SOFTWARE. */ #include "mesh.hpp" -#include "mesh/mesh2D.hpp" + +namespace libp { // ------------------------------------------------------------------------ // QUAD 2D NODES // ------------------------------------------------------------------------ -void mesh_t::NodesQuad2D(int _N, dfloat *_r, dfloat *_s){ - int _Nq = _N+1; +void mesh_t::NodesQuad2D(const int _N, + memory& _r, + memory& _s){ + const int _Nq = _N+1; + const int _Np = _Nq*_Nq; - dfloat *r1D = (dfloat*) malloc(_Nq*sizeof(dfloat)); + memory r1D; JacobiGLL(_N, r1D); //Gauss-Legendre-Lobatto nodes + _r.malloc(_Np); + _s.malloc(_Np); + //Tensor product for (int j=0;j<_Nq;j++) { for (int i=0;i<_Nq;i++) { @@ -43,14 +50,15 @@ void mesh_t::NodesQuad2D(int _N, dfloat *_r, dfloat *_s){ _s[i+j*_Nq] = r1D[j]; } } - - free(r1D); } -void mesh_t::FaceNodesQuad2D(int _N, dfloat *_r, dfloat *_s, int *_faceNodes){ - int _Nq = _N+1; - int _Nfp = _Nq; - int _Np = _Nq*_Nq; +void mesh_t::FaceNodesQuad2D(const int _N, + const memory _r, + const memory _s, + memory& _faceNodes){ + const int _Nq = _N+1; + const int _Nfp = _Nq; + const int _Np = _Nq*_Nq; int cnt[4]; for (int i=0;i<4;i++) cnt[i]=0; @@ -61,6 +69,7 @@ void mesh_t::FaceNodesQuad2D(int _N, dfloat *_r, dfloat *_s, int *_faceNodes){ const dfloat NODETOL = 1000.*deps; + _faceNodes.malloc(4*_Nfp); for (int n=0;n<_Np;n++) { if(fabs(_s[n]+1) _r, + const memory _s, + memory& _vertexNodes){ + const int _Nq = _N+1; + const int _Np = _Nq*_Nq; dfloat deps = 1.; while((1.+deps)>1.) 
@@ -83,6 +95,7 @@ void mesh_t::VertexNodesQuad2D(int _N, dfloat *_r, dfloat *_s, int *_vertexNodes const dfloat NODETOL = 1000.*deps; + _vertexNodes.malloc(4); for(int n=0;n<_Np;++n){ if( (_r[n]+1)*(_r[n]+1)+(_s[n]+1)*(_s[n]+1) _r, + const memory _s, + const memory _faceNodes, + const memory _faceVertices, + memory& R){ + + const int _Nfaces = 4; + const int _Nverts = 4; + const int _NfaceVertices = 2; + + const int _Nfp = _faceNodes.length()/_Nfaces; + + const dfloat NODETOL = 1.0e-5; + + dfloat V[2] = {-1.0, 1.0}; + + dfloat EX0[_Nverts]; + dfloat EX1[_Nverts]; + + memory x0(_Nfp); + memory x1(_Nfp); + + R.malloc(_Nfaces*_Nfaces*_NfaceVertices*_Nfp); + + for (int fM=0;fM<_Nfaces;fM++) { + + for (int v=0;v<_Nverts;v++) { + EX0[v] = 0.0; + } + //setup top element with face fM on the bottom + for (int v=0;v<_NfaceVertices;v++) { + int fv = _faceVertices[fM*_NfaceVertices + v]; + EX0[fv] = V[v]; + } + + for(int n=0;n<_Nfp;++n){ /* for each face node */ + const int fn = _faceNodes[fM*_Nfp+n]; + + /* (r,s) coordinates of interpolation nodes*/ + dfloat rn = _r[fn]; + dfloat sn = _s[fn]; + + /* physical coordinate of interpolation node */ + x0[n] = 0.25*(1-rn)*(1-sn)*EX0[0] + +0.25*(1+rn)*(1-sn)*EX0[1] + +0.25*(1+rn)*(1+sn)*EX0[2] + +0.25*(1-rn)*(1+sn)*EX0[3]; + } + + for (int fP=0;fP<_Nfaces;fP++) { /*For each neighbor face */ + for (int rot=0;rot<_NfaceVertices;rot++) { /* For each face rotation */ + // Zero vertices + for (int v=0;v<_Nverts;v++) { + EX1[v] = 0.0; + } + //setup bottom element with face fP on the top + for (int v=0;v<_NfaceVertices;v++) { + int fv = _faceVertices[fP*_NfaceVertices + ((v+rot)%_NfaceVertices)]; + EX1[fv] = V[v]; + } + + for(int n=0;n<_Nfp;++n){ /* for each node */ + const int fn = _faceNodes[fP*_Nfp+n]; + + /* (r,s,t) coordinates of interpolation nodes*/ + dfloat rn = _r[fn]; + dfloat sn = _s[fn]; + + /* physical coordinate of interpolation node */ + x1[n] = 0.25*(1-rn)*(1-sn)*EX1[0] + +0.25*(1+rn)*(1-sn)*EX1[1] + 
+0.25*(1+rn)*(1+sn)*EX1[2] + +0.25*(1-rn)*(1+sn)*EX1[3]; + } + + /* for each node on this face find the neighbor node */ + for(int n=0;n<_Nfp;++n){ + const dfloat xM = x0[n]; + + int m=0; + for(;m<_Nfp;++m){ /* for each neighbor node */ + const dfloat xP = x1[m]; + + /* distance between target and neighbor node */ + const dfloat dist = pow(xM-xP,2); + + /* if neighbor node is close to target, match */ + if(distNODETOL); + } + } + } + } +} + +void mesh_t::EquispacedNodesQuad2D(const int _N, + memory& _r, + memory& _s){ + const int _Nq = _N+1; + const int _Np = _Nq*_Nq; //Equispaced 1D nodes - dfloat *r1D = (dfloat*) malloc(_Nq*sizeof(dfloat)); + memory r1D; EquispacedNodes1D(_N, r1D); //Tensor product + _r.malloc(_Np); + _s.malloc(_Np); for (int j=0;j<_Nq;j++) { for (int i=0;i<_Nq;i++) { _r[i+j*_Nq] = r1D[i]; _s[i+j*_Nq] = r1D[j]; } } - - free(r1D); } -void mesh_t::EquispacedEToVQuad2D(int _N, int *_EToV){ - int _Nq = _N+1; - int _Nverts = 3; +void mesh_t::EquispacedEToVQuad2D(const int _N, memory& _EToV){ + const int _Nq = _N+1; + const int _Nelements = 2*_N*_N; + const int _Nverts = 3; + + _EToV.malloc(_Nelements*_Nverts); //Tensor product int cnt=0; @@ -134,9 +267,12 @@ void mesh_t::EquispacedEToVQuad2D(int _N, int *_EToV){ } } -void mesh_t::SEMFEMEToVQuad2D(int _N, int *_EToV){ - int _Nq = _N+1; - int _Nverts = 4; +void mesh_t::SEMFEMEToVQuad2D(const int _N, memory& _EToV){ + const int _Nq = _N+1; + const int _Nelements = _N*_N; + const int _Nverts = 4; + + _EToV.malloc(_Nelements*_Nverts); //Tensor product int cnt=0; @@ -154,44 +290,60 @@ void mesh_t::SEMFEMEToVQuad2D(int _N, int *_EToV){ // ------------------------------------------------------------------------ // ORTHONORMAL BASIS POLYNOMIALS // ------------------------------------------------------------------------ -void mesh_t::OrthonormalBasisQuad2D(dfloat a, dfloat b, int i, int j, dfloat *P){ - *P = JacobiP(a,0,0,i)*JacobiP(b,0,0,j); +void mesh_t::OrthonormalBasisQuad2D(const dfloat a, const dfloat b, 
+ const int i, const int j, + dfloat& P){ + P = JacobiP(a,0,0,i)*JacobiP(b,0,0,j); } -void mesh_t::GradOrthonormalBasisQuad2D(dfloat a, dfloat b, int i, int j, dfloat *Pr, dfloat *Ps){ - *Pr = GradJacobiP(a,0,0,i)*JacobiP(b,0,0,j); - *Ps = JacobiP(a,0,0,i)*GradJacobiP(b,0,0,j); +void mesh_t::GradOrthonormalBasisQuad2D(const dfloat a, const dfloat b, + const int i, const int j, + dfloat& Pr, dfloat& Ps){ + Pr = GradJacobiP(a,0,0,i)*JacobiP(b,0,0,j); + Ps = JacobiP(a,0,0,i)*GradJacobiP(b,0,0,j); } // ------------------------------------------------------------------------ // 2D VANDERMONDE MATRICES // ------------------------------------------------------------------------ -void mesh_t::VandermondeQuad2D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloat *V){ +void mesh_t::VandermondeQuad2D(const int _N, + const memory _r, + const memory _s, + memory& V){ - int _Nq = _N+1; - int _Np = _Nq*_Nq; + const int _Nq = _N+1; + const int _Np = _Nq*_Nq; + const int Npoints = _r.length(); + V.malloc(Npoints*_Np); for(int n=0; n _r, + const memory _s, + memory& Vr, + memory& Vs){ - int _Nq = _N+1; - int _Np = _Nq*_Nq; + const int _Nq = _N+1; + const int _Np = _Nq*_Nq; + const int Npoints = _r.length(); + Vr.malloc(Npoints*_Np); + Vs.malloc(Npoints*_Np); for(int n=0; n V, + memory& _MM){ - // masMatrix = inv(V')*inv(V) = inv(V*V') + // massMatrix = inv(V')*inv(V) = inv(V*V') + _MM.malloc(_Np*_Np); for(int n=0;n<_Np;++n){ for(int m=0;m<_Np;++m){ dfloat res = 0; @@ -212,15 +367,18 @@ void mesh_t::MassMatrixQuad2D(int _Np, dfloat *V, dfloat *_MM){ _MM[n*_Np + m] = res; } } - matrixInverse(_Np, _MM); + linAlg_t::matrixInverse(_Np, _MM); } -void mesh_t::LumpedMassMatrixQuad2D(int _N, dfloat *_gllw, dfloat *_MM){ +void mesh_t::LumpedMassMatrixQuad2D(const int _N, + const memory _gllw, + memory& _MM){ - int _Nq = _N+1; - int _Np = _Nq*_Nq; + const int _Nq = _N+1; + const int _Np = _Nq*_Nq; // LumpedMassMatrix = gllw \ctimes gllw + _MM.malloc(_Np*_Np, 0.0); for(int n=0;n<_Nq;++n){ 
for(int m=0;m<_Nq;++m){ int id = n+m*_Nq; @@ -229,12 +387,15 @@ void mesh_t::LumpedMassMatrixQuad2D(int _N, dfloat *_gllw, dfloat *_MM){ } } -void mesh_t::invLumpedMassMatrixQuad2D(int _N, dfloat *_gllw, dfloat *_invMM){ +void mesh_t::invLumpedMassMatrixQuad2D(const int _N, + const memory _gllw, + memory& _invMM){ int _Nq = _N+1; int _Np = _Nq*_Nq; // invLumpedMassMatrix = invgllw \ctimes invgllw + _invMM.malloc(_Np*_Np, 0.0); for(int n=0;n<_Nq;++n){ for(int m=0;m<_Nq;++m){ int id = n+m*_Nq; @@ -243,43 +404,51 @@ void mesh_t::invLumpedMassMatrixQuad2D(int _N, dfloat *_gllw, dfloat *_invMM){ } } -void mesh_t::DmatrixQuad2D(int _N, int Npoints, dfloat *_r, dfloat *_s, - dfloat *_Dr, dfloat *_Ds){ +void mesh_t::DmatrixQuad2D(const int _N, + const memory _r, + const memory _s, + memory& _D){ - int _Np = (_N+1)*(_N+1); + const int _Nq = _N+1; + const int _Np = _Nq*_Nq; - dfloat *V = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat)); - dfloat *Vr = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat)); - dfloat *Vs = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat)); - - VandermondeQuad2D(_N, Npoints, _r, _s, V); - GradVandermondeQuad2D(_N, Npoints, _r, _s, Vr, Vs); + memory V, Vr, Vs; + VandermondeQuad2D(_N, _r, _s, V); + GradVandermondeQuad2D(_N, _r, _s, Vr, Vs); //Dr = Vr/V, Ds = Vs/V - matrixRightSolve(_Np, _Np, Vr, _Np, _Np, V, _Dr); - matrixRightSolve(_Np, _Np, Vs, _Np, _Np, V, _Ds); - - free(V); free(Vr); free(Vs); + _D.malloc(2*_Np*_Np); + memory _Dr = _D + 0*_Np*_Np; + memory _Ds = _D + 1*_Np*_Np; + linAlg_t::matrixRightSolve(_Np, _Np, Vr, _Np, _Np, V, _Dr); + linAlg_t::matrixRightSolve(_Np, _Np, Vs, _Np, _Np, V, _Ds); } -void mesh_t::InterpolationMatrixQuad2D(int _N, - int NpointsIn, dfloat *rIn, dfloat *sIn, - int NpointsOut, dfloat *rOut, dfloat *sOut, - dfloat *I){ +void mesh_t::InterpolationMatrixQuad2D(const int _N, + const memory rIn, + const memory sIn, + const memory rOut, + const memory sOut, + memory& I){ - int _Np = (_N+1)*(_N+1); + const int _Nq = _N+1; + const 
int _Np = _Nq*_Nq; - // need NpointsIn = _Np - if (NpointsIn != _Np) - LIBP_ABORT(string("Invalid Interplation operator requested.")) + const int NpointsIn = rIn.length(); + const int NpointsOut = rOut.length(); - dfloat *VIn = (dfloat*) malloc(NpointsIn*_Np*sizeof(dfloat)); - dfloat *VOut= (dfloat*) malloc(NpointsOut*_Np*sizeof(dfloat)); + // need NpointsIn = _Np + LIBP_ABORT("Invalid Interplation operator requested.", + NpointsIn != _Np); - VandermondeQuad2D(_N, NpointsIn, rIn, sIn, VIn); - VandermondeQuad2D(_N, NpointsOut, rOut, sOut, VOut); + memory VIn; + memory VOut; + VandermondeQuad2D(_N, rIn, sIn, VIn); + VandermondeQuad2D(_N, rOut, sOut, VOut); - matrixRightSolve(NpointsOut, _Np, VOut, NpointsIn, _Np, VIn, I); + I.malloc(NpointsIn*NpointsOut); + linAlg_t::matrixRightSolve(NpointsOut, _Np, VOut, + NpointsIn, _Np, VIn, I); +} - free(VIn); free(VOut); -} \ No newline at end of file +} //namespace libp diff --git a/libs/mesh/meshBasisTet3D.cpp b/libs/mesh/meshBasisTet3D.cpp index 4d2b0828d..4c1386fa4 100644 --- a/libs/mesh/meshBasisTet3D.cpp +++ b/libs/mesh/meshBasisTet3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim WarburtonTim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,32 +25,34 @@ SOFTWARE. 
*/ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" + +namespace libp { // ------------------------------------------------------------------------ // TET 3D NODES // ------------------------------------------------------------------------ -void mesh_t::NodesTet3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t){ - - int _Np = (_N+1)*(_N+2)*(_N+3)/6; - +void mesh_t::NodesTet3D(const int _N, + memory& _r, + memory& _s, + memory& _t){ EquispacedNodesTet3D(_N, _r, _s, _t); //make equispaced nodes on reference tet - WarpBlendTransformTet3D(_N, _Np, _r, _s, _t); //apply warp&blend transform + WarpBlendTransformTet3D(_N, _r, _s, _t); //apply warp&blend transform } -void mesh_t::FaceNodesTet3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_faceNodes){ - int _Nfp = (_N+1)*(_N+2)/2; - int _Np = (_N+1)*(_N+2)*(_N+3)/6; +void mesh_t::FaceNodesTet3D(const int _N, + const memory _r, + const memory _s, + const memory _t, + memory& _faceNodes){ + const int _Nfp = (_N+1)*(_N+2)/2; + const int _Np = (_N+1)*(_N+2)*(_N+3)/6; int cnt[4]; for (int i=0;i<4;i++) cnt[i]=0; - dfloat deps = 1.; - while((1.+deps)>1.) - deps *= 0.5; - - const dfloat NODETOL = 1000.*deps; + const dfloat NODETOL = 1.0e-5; + _faceNodes.malloc(4*_Nfp); for (int n=0;n<_Np;n++) { if(fabs(_t[n]+1) _r, + const memory _s, + const memory _t, + memory& _vertexNodes){ + const int _Np = (_N+1)*(_N+2)*(_N+3)/6; dfloat deps = 1.; while((1.+deps)>1.) 
@@ -72,6 +78,7 @@ void mesh_t::VertexNodesTet3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_ const dfloat NODETOL = 1000.*deps; + _vertexNodes.malloc(4); for(int n=0;n<_Np;++n){ if( (_r[n]+1)*(_r[n]+1)+(_s[n]+1)*(_s[n]+1)+(_t[n]+1)*(_t[n]+1) _r, + const memory _s, + const memory _t, + const memory _faceNodes, + const memory _faceVertices, + memory& R){ + + const int _Nfaces = 4; + const int _Nverts = 4; + const int _NfaceVertices = 3; + + const int _Nfp = _faceNodes.length()/_Nfaces; + + const dfloat NODETOL = 1.0e-5; + + dfloat V0[3][2] = {{-1.0,-1.0},{ 1.0,-1.0},{-1.0, 1.0}}; + dfloat V1[3][2] = {{-1.0,-1.0},{-1.0, 1.0},{ 1.0,-1.0}}; + + dfloat EX0[_Nverts], EY0[_Nverts]; + dfloat EX1[_Nverts], EY1[_Nverts]; + + memory x0(_Nfp); + memory y0(_Nfp); + + memory x1(_Nfp); + memory y1(_Nfp); + + R.malloc(_Nfaces*_Nfaces*_NfaceVertices*_Nfp); + + for (int fM=0;fM<_Nfaces;fM++) { + + for (int v=0;v<_Nverts;v++) { + EX0[v] = 0.0; EY0[v] = 0.0; + } + //setup top element with face fM on the bottom + for (int v=0;v<_NfaceVertices;v++) { + int fv = _faceVertices[fM*_NfaceVertices + v]; + EX0[fv] = V0[v][0]; EY0[fv] = V0[v][1]; + } + + for(int n=0;n<_Nfp;++n){ /* for each face node */ + const int fn = _faceNodes[fM*_Nfp+n]; + + /* (r,s,t) coordinates of interpolation nodes*/ + dfloat rn = _r[fn]; + dfloat sn = _s[fn]; + dfloat tn = _t[fn]; + + /* physical coordinate of interpolation node */ + x0[n] = -0.5*(1+rn+sn+tn)*EX0[0] + + 0.5*(1+rn)*EX0[1] + + 0.5*(1+sn)*EX0[2] + + 0.5*(1+tn)*EX0[3]; + y0[n] = -0.5*(1+rn+sn+tn)*EY0[0] + + 0.5*(1+rn)*EY0[1] + + 0.5*(1+sn)*EY0[2] + + 0.5*(1+tn)*EY0[3]; + } + + for (int fP=0;fP<_Nfaces;fP++) { /*For each neighbor face */ + for (int rot=0;rot<_NfaceVertices;rot++) { /* For each face rotation */ + // Zero vertices + for (int v=0;v<_Nverts;v++) { + EX1[v] = 0.0; EY1[v] = 0.0; + } + //setup bottom element with face fP on the top + for (int v=0;v<_NfaceVertices;v++) { + int fv = _faceVertices[fP*_NfaceVertices + 
((v+rot)%_NfaceVertices)]; + EX1[fv] = V1[v][0]; EY1[fv] = V1[v][1]; + } + + for(int n=0;n<_Nfp;++n){ /* for each node */ + const int fn = _faceNodes[fP*_Nfp+n]; + + /* (r,s,t) coordinates of interpolation nodes*/ + dfloat rn = _r[fn]; + dfloat sn = _s[fn]; + dfloat tn = _t[fn]; + + /* physical coordinate of interpolation node */ + x1[n] = -0.5*(1+rn+sn+tn)*EX1[0] + + 0.5*(1+rn)*EX1[1] + + 0.5*(1+sn)*EX1[2] + + 0.5*(1+tn)*EX1[3]; + y1[n] = -0.5*(1+rn+sn+tn)*EY1[0] + + 0.5*(1+rn)*EY1[1] + + 0.5*(1+sn)*EY1[2] + + 0.5*(1+tn)*EY1[3]; + } + + /* for each node on this face find the neighbor node */ + for(int n=0;n<_Nfp;++n){ + const dfloat xM = x0[n]; + const dfloat yM = y0[n]; + + int m=0; + for(;m<_Nfp;++m){ /* for each neighbor node */ + const dfloat xP = x1[m]; + const dfloat yP = y1[m]; + + /* distance between target and neighbor node */ + const dfloat dist = pow(xM-xP,2) + pow(yM-yP,2); + + /* if neighbor node is close to target, match */ + if(distNODETOL); + } + } + } + } +} + // Create equidistributed nodes on reference tet -void mesh_t::EquispacedNodesTet3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t){ +void mesh_t::EquispacedNodesTet3D(const int _N, + memory& _r, + memory& _s, + memory& _t){ + const int _Np = (_N+1)*(_N+2)*(_N+3)/6; + + _r.malloc(_Np); + _s.malloc(_Np); + _t.malloc(_Np); int sk = 0; for (int k=0;k<_N+1;k++) { @@ -100,8 +247,11 @@ void mesh_t::EquispacedNodesTet3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t){ } } -void mesh_t::EquispacedEToVTet3D(int _N, int *_EToV){ - int _Nverts = 4; +void mesh_t::EquispacedEToVTet3D(const int _N, memory& _EToV){ + const int _Nverts = 4; + const int _Nelements = _N*_N*_N; + + _EToV.malloc(_Nelements*_Nverts); int cnt=0; int sk=0; @@ -161,14 +311,16 @@ void mesh_t::EquispacedEToVTet3D(int _N, int *_EToV){ } } -void mesh_t::SEMFEMEToVTet3D(int _N, int *_EToV){ +void mesh_t::SEMFEMEToVTet3D(const int _N, memory& _EToV){ EquispacedEToVTet3D(_N, _EToV); } // 
------------------------------------------------------------------------ // ORTHONORMAL BASIS POLYNOMIALS // ------------------------------------------------------------------------ -void mesh_t::OrthonormalBasisTet3D(dfloat _r, dfloat _s, dfloat _t, int i, int j, int k, dfloat *P){ +void mesh_t::OrthonormalBasisTet3D(const dfloat _r, const dfloat _s, const dfloat _t, + const int i, const int j, const int k, + dfloat& P){ // First convert to abc coordinates dfloat a, b, c; if(fabs(_s+_t)>1e-8) @@ -187,11 +339,12 @@ void mesh_t::OrthonormalBasisTet3D(dfloat _r, dfloat _s, dfloat _t, int i, int j dfloat p2 = JacobiP(b,2*i+1,0,j); dfloat p3 = JacobiP(c,2*(i+j)+2,0,k); - *P = 2.*sqrt(2.0)*p1*p2*p3*pow(1.0-b,i)*pow(1.0-c,i+j); + P = 2.*sqrt(2.0)*p1*p2*p3*pow(1.0-b,i)*pow(1.0-c,i+j); } -void mesh_t::GradOrthonormalBasisTet3D(dfloat _r, dfloat _s, dfloat _t, - int i, int j, int k, dfloat *Pr, dfloat *Ps, dfloat *Pt){ +void mesh_t::GradOrthonormalBasisTet3D(const dfloat _r, const dfloat _s, const dfloat _t, + const int i, const int j, const int k, + dfloat& Pr, dfloat& Ps, dfloat& Pt){ // First convert to abc coordinates dfloat a, b, c; if(fabs(_s+_t)>1e-8) @@ -214,48 +367,54 @@ void mesh_t::GradOrthonormalBasisTet3D(dfloat _r, dfloat _s, dfloat _t, dfloat p2b = GradJacobiP(b,2*i+1,0,j); dfloat p3c = GradJacobiP(c,2*(i+j)+2,0,k); - *Pr = p1a*p2*p3; + Pr = p1a*p2*p3; if(i>0) - *Pr *= pow(0.5*(1.0-b), i-1); + Pr *= pow(0.5*(1.0-b), i-1); if(i+j>0) - *Pr *= pow(0.5*(1.0-c), i+j-1); + Pr *= pow(0.5*(1.0-c), i+j-1); - *Ps = 0.5*(1.0+a)*(*Pr); + Ps = 0.5*(1.0+a)*(Pr); dfloat tmp = p2b*pow(0.5*(1.0-b), i); if(i>0) tmp += -0.5*i*p2*pow(0.5*(1.0-b), i-1); if(i+j>0) tmp *= pow(0.5*(1.0-c), i+j-1); tmp *= p1*p3; - *Ps += tmp; + Ps += tmp; - *Pt = 0.5*(1.0+a)*(*Pr) + 0.5*(1.0+b)*tmp; + Pt = 0.5*(1.0+a)*(Pr) + 0.5*(1.0+b)*tmp; tmp = p3c*pow(0.5*(1-c), i+j); if(i+j>0) tmp -= 0.5*(i+j)*(p3*pow(0.5*(1.0-c), i+j-1)); tmp *= p1*p2*pow(0.5*(1-b), i); - *Pt += tmp; + Pt += tmp; - *Pr *= 
pow(2, 2*i+j+1.5); - *Ps *= pow(2, 2*i+j+1.5); - *Pt *= pow(2, 2*i+j+1.5); + Pr *= pow(2, 2*i+j+1.5); + Ps *= pow(2, 2*i+j+1.5); + Pt *= pow(2, 2*i+j+1.5); } // ------------------------------------------------------------------------ // 3D VANDERMONDE MATRICES // ------------------------------------------------------------------------ -void mesh_t::VandermondeTet3D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloat *_t, dfloat *V){ +void mesh_t::VandermondeTet3D(const int _N, + const memory _r, + const memory _s, + const memory _t, + memory& V){ - int _Np = (_N+1)*(_N+2)*(_N+3)/6; + const int _Np = (_N+1)*(_N+2)*(_N+3)/6; + const int Npoints = _r.length(); + V.malloc(Npoints*_Np); for(int n=0; n _r, + const memory _s, + const memory _t, + memory& Vr, + memory& Vs, + memory& Vt){ - int _Np = (_N+1)*(_N+2)*(_N+3)/6; + const int _Np = (_N+1)*(_N+2)*(_N+3)/6; + const int Npoints = _r.length(); + Vr.malloc(Npoints*_Np); + Vs.malloc(Npoints*_Np); + Vt.malloc(Npoints*_Np); for(int n=0; n V, + memory& _MM){ // massMatrix = inv(V')*inv(V) = inv(V*V') + _MM.malloc(_Np*_Np); for(int n=0;n<_Np;++n){ for(int m=0;m<_Np;++m){ dfloat res = 0; @@ -297,12 +468,15 @@ void mesh_t::MassMatrixTet3D(int _Np, dfloat *V, dfloat *_MM){ _MM[n*_Np + m] = res; } } - matrixInverse(_Np, _MM); + linAlg_t::matrixInverse(_Np, _MM); } -void mesh_t::invMassMatrixTet3D(int _Np, dfloat *V, dfloat *_invMM){ +void mesh_t::invMassMatrixTet3D(const int _Np, + const memory V, + memory& _invMM){ // massMatrix^{-1} = V*V' + _invMM.malloc(_Np*_Np); for(int n=0;n<_Np;++n){ for(int m=0;m<_Np;++m){ dfloat res = 0; @@ -314,43 +488,46 @@ void mesh_t::invMassMatrixTet3D(int _Np, dfloat *V, dfloat *_invMM){ } } -void mesh_t::DmatrixTet3D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloat *_t, - dfloat *_Dr, dfloat *_Ds, dfloat *_Dt){ +void mesh_t::DmatrixTet3D(const int _N, + const memory _r, + const memory _s, + const memory _t, + memory& _D){ - int _Np = (_N+1)*(_N+2)*(_N+3)/6; + const int _Np = 
(_N+1)*(_N+2)*(_N+3)/6; - dfloat *V = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat)); - dfloat *Vr = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat)); - dfloat *Vs = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat)); - dfloat *Vt = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat)); - - VandermondeTet3D(_N, Npoints, _r, _s, _t, V); - GradVandermondeTet3D(_N, Npoints, _r, _s, _t, Vr, Vs, Vt); + memory V, Vr, Vs, Vt; + VandermondeTet3D(_N, _r, _s, _t, V); + GradVandermondeTet3D(_N, _r, _s, _t, Vr, Vs, Vt); //Dr = Vr/V, Ds = Vs/V - matrixRightSolve(_Np, _Np, Vr, _Np, _Np, V, _Dr); - matrixRightSolve(_Np, _Np, Vs, _Np, _Np, V, _Ds); - matrixRightSolve(_Np, _Np, Vt, _Np, _Np, V, _Dt); - - free(V); free(Vr); free(Vs); free(Vt); + _D.malloc(3*_Np*_Np); + memory _Dr = _D + 0*_Np*_Np; + memory _Ds = _D + 1*_Np*_Np; + memory _Dt = _D + 2*_Np*_Np; + linAlg_t::matrixRightSolve(_Np, _Np, Vr, _Np, _Np, V, _Dr); + linAlg_t::matrixRightSolve(_Np, _Np, Vs, _Np, _Np, V, _Ds); + linAlg_t::matrixRightSolve(_Np, _Np, Vt, _Np, _Np, V, _Dt); } -void mesh_t::LIFTmatrixTet3D(int _N, int *_faceNodes, - dfloat *_r, dfloat *_s, dfloat *_t, dfloat *_LIFT){ +void mesh_t::LIFTmatrixTet3D(const int _N, + const memory _faceNodes, + const memory _r, + const memory _s, + const memory _t, + memory& _LIFT){ - int _Nfp = (_N+1)*(_N+2)/2; - int _Np = (_N+1)*(_N+2)*(_N+3)/6; - int _Nfaces = 4; + const int _Nfp = (_N+1)*(_N+2)/2; + const int _Np = (_N+1)*(_N+2)*(_N+3)/6; + const int _Nfaces = 4; - dfloat *E = (dfloat *) calloc(_Np*_Nfaces*_Nfp, sizeof(dfloat)); + memory E(_Np*_Nfaces*_Nfp, 0); - dfloat *r2D = (dfloat *) malloc(_Nfp*sizeof(dfloat)); - dfloat *s2D = (dfloat *) malloc(_Nfp*sizeof(dfloat)); - dfloat *V2D = (dfloat *) malloc(_Nfp*_Nfp*sizeof(dfloat)); - dfloat *MM2D = (dfloat *) malloc(_Nfp*_Nfp*sizeof(dfloat)); + memory r2D(_Nfp); + memory s2D(_Nfp); for (int f=0;f<_Nfaces;f++) { - dfloat *rFace, *sFace; + memory rFace, sFace; if (f==0) {rFace = _r; sFace = _s;} if (f==1) {rFace = _r; sFace = _t;} if 
(f==2) {rFace = _s; sFace = _t;} @@ -361,7 +538,8 @@ void mesh_t::LIFTmatrixTet3D(int _N, int *_faceNodes, s2D[i] = sFace[_faceNodes[f*_Nfp+i]]; } - VandermondeTri2D(_N, _Nfp, r2D, s2D, V2D); + memory V2D, MM2D; + VandermondeTri2D(_N, r2D, s2D, V2D); MassMatrixTri2D(_Nfp, V2D, MM2D); for (int j=0;j<_Nfp;j++) { @@ -372,9 +550,10 @@ void mesh_t::LIFTmatrixTet3D(int _N, int *_faceNodes, } } - dfloat *V = (dfloat *) malloc(_Np*_Np*sizeof(dfloat)); - VandermondeTet3D(_N, _Np, _r, _s, _t, V); + memory V; + VandermondeTet3D(_N, _r, _s, _t, V); + _LIFT.malloc(_Np*_Nfaces*_Nfp); for (int n=0;n<_Np;n++) { for (int m=0;m<_Nfaces*_Nfp;m++) { @@ -388,16 +567,18 @@ void mesh_t::LIFTmatrixTet3D(int _N, int *_faceNodes, } } } - - free(V); free(r2D); free(s2D); free(V2D); free(MM2D); free(E); } -void mesh_t::SurfaceMassMatrixTet3D(int _N, dfloat *_MM, dfloat *_LIFT, dfloat *_sM){ +void mesh_t::SurfaceMassMatrixTet3D(const int _N, + const memory _MM, + const memory _LIFT, + memory& _sM){ - int _Nfp = (_N+1)*(_N+2)/2; - int _Np = (_N+1)*(_N+2)*(_N+3)/6; - int _Nfaces = 4; + const int _Nfp = (_N+1)*(_N+2)/2; + const int _Np = (_N+1)*(_N+2)*(_N+3)/6; + const int _Nfaces = 4; + _sM.malloc(_Np*_Nfaces*_Nfp); for (int n=0;n<_Np;n++) { for (int m=0;m<_Nfp*_Nfaces;m++) { _sM[m+n*_Nfp*_Nfaces] = 0; @@ -408,12 +589,22 @@ void mesh_t::SurfaceMassMatrixTet3D(int _N, dfloat *_MM, dfloat *_LIFT, dfloat * } } -void mesh_t::SmatrixTet3D(int _N, dfloat *_Dr, dfloat *_Ds, dfloat *_Dt, dfloat *_MM, - dfloat *_Srr, dfloat *_Srs, dfloat *_Srt, - dfloat *_Sss, dfloat *_Sst, dfloat *_Stt){ - - int _Np = (_N+1)*(_N+2)*(_N+3)/6; - +void mesh_t::SmatrixTet3D(const int _N, + const memory _Dr, + const memory _Ds, + const memory _Dt, + const memory _MM, + memory& _S){ + + const int _Np = (_N+1)*(_N+2)*(_N+3)/6; + + _S.malloc(6*_Np*_Np, 0.0); + memory _Srr = _S + 0*_Np*_Np; + memory _Srs = _S + 1*_Np*_Np; + memory _Srt = _S + 2*_Np*_Np; + memory _Sss = _S + 3*_Np*_Np; + memory _Sst = _S + 4*_Np*_Np; + memory 
_Stt = _S + 5*_Np*_Np; for (int n=0;n<_Np;n++) { for (int m=0;m<_Np;m++) { for (int k=0;k<_Np;k++) { @@ -433,59 +624,65 @@ void mesh_t::SmatrixTet3D(int _N, dfloat *_Dr, dfloat *_Ds, dfloat *_Dt, dfloat } } -void mesh_t::InterpolationMatrixTet3D(int _N, - int NpointsIn, dfloat *rIn, dfloat *sIn, dfloat *tIn, - int NpointsOut, dfloat *rOut, dfloat *sOut, dfloat *tOut, - dfloat *I){ - - int _Np = (_N+1)*(_N+2)*(_N+3)/6; +void mesh_t::InterpolationMatrixTet3D(const int _N, + const memory rIn, + const memory sIn, + const memory tIn, + const memory rOut, + const memory sOut, + const memory tOut, + memory& I){ - // need NpointsIn = _Np - if (NpointsIn != _Np) - LIBP_ABORT(string("Invalid Interplation operator requested.")) + const int _Np = (_N+1)*(_N+2)*(_N+3)/6; - dfloat *VIn = (dfloat*) malloc(NpointsIn*_Np*sizeof(dfloat)); - dfloat *VOut= (dfloat*) malloc(NpointsOut*_Np*sizeof(dfloat)); + const int NpointsIn = rIn.length(); + const int NpointsOut = rOut.length(); - VandermondeTet3D(_N, NpointsIn, rIn, sIn, tIn, VIn); - VandermondeTet3D(_N, NpointsOut, rOut, sOut, tOut, VOut); + // need NpointsIn = _Np + LIBP_ABORT("Invalid Interplation operator requested.", + NpointsIn != _Np); - matrixRightSolve(NpointsOut, _Np, VOut, NpointsIn, _Np, VIn, I); + memory VIn; + memory VOut; + VandermondeTet3D(_N, rIn, sIn, tIn, VIn); + VandermondeTet3D(_N, rOut, sOut, tOut, VOut); - free(VIn); free(VOut); + I.malloc(NpointsIn*NpointsOut); + linAlg_t::matrixRightSolve(NpointsOut, _Np, VOut, + NpointsIn, _Np, VIn, I); } -void mesh_t::DegreeRaiseMatrixTet3D(int Nc, int Nf, dfloat *P){ - - int Npc = (Nc+1)*(Nc+2)*(Nc+3)/6; - int Npf = (Nf+1)*(Nf+2)*(Nf+3)/6; - - dfloat *rc = (dfloat *) malloc(Npc*sizeof(dfloat)); - dfloat *sc = (dfloat *) malloc(Npc*sizeof(dfloat)); - dfloat *tc = (dfloat *) malloc(Npc*sizeof(dfloat)); - dfloat *rf = (dfloat *) malloc(Npf*sizeof(dfloat)); - dfloat *sf = (dfloat *) malloc(Npf*sizeof(dfloat)); - dfloat *tf = (dfloat *) malloc(Npf*sizeof(dfloat)); +void 
mesh_t::DegreeRaiseMatrixTet3D(const int Nc, const int Nf, + memory& P){ + memory rc, sc, tc; + memory rf, sf, tf; NodesTet3D(Nc, rc, sc, tc); NodesTet3D(Nf, rf, sf, tf); - InterpolationMatrixTet3D(Nc, Npc, rc, sc, tc, Npf, rf, sf, tf, P); - - free(rc); free(sc); free(tc); free(rf); free(sf); free(tf); + InterpolationMatrixTet3D(Nc, rc, sc, tc, rf, sf, tf, P); } -void mesh_t::CubaturePmatrixTet3D(int _N, int _Np, dfloat *_r, dfloat *_s, dfloat *_t, - int _cubNp, dfloat *_cubr, dfloat *_cubs, dfloat *_cubt, - dfloat *_cubProject){ +void mesh_t::CubaturePmatrixTet3D(const int _N, + const memory _r, + const memory _s, + const memory _t, + const memory _cubr, + const memory _cubs, + const memory _cubt, + memory& _cubProject){ + + const int _Np = (_N+1)*(_N+2)*(_N+3)/6; + const int _cubNp = _cubr.length(); - dfloat *V = (dfloat*) malloc(_Np*_Np*sizeof(dfloat)); - VandermondeTet3D(_N, _Np, _r, _s, _t, V); + memory V; + VandermondeTet3D(_N, _r, _s, _t, V); - dfloat *cubV = (dfloat*) malloc(_cubNp*_Np*sizeof(dfloat)); - VandermondeTet3D(_N, _cubNp, _cubr, _cubs, _cubt, cubV); + memory cubV; + VandermondeTet3D(_N, _cubr, _cubs, _cubt, cubV); // cubProject = V*cV' %% relies on (transpose(cV)*diag(cubw)*cV being the identity) + _cubProject.malloc(_Np*_cubNp); for(int n=0;n<_Np;++n){ for(int m=0;m<_cubNp;++m){ dfloat resP = 0; @@ -495,26 +692,34 @@ void mesh_t::CubaturePmatrixTet3D(int _N, int _Np, dfloat *_r, dfloat *_s, dfloa _cubProject[n*_cubNp+m] = resP; } } - free(V); free(cubV); } -void mesh_t::CubatureWeakDmatricesTet3D(int _N, int _Np, dfloat *_r, dfloat *_s, dfloat *_t, - int _cubNp, dfloat *_cubr, dfloat *_cubs, dfloat *_cubt, - dfloat *_cubPDrT, dfloat *_cubPDsT, dfloat *_cubPDtT){ +void mesh_t::CubatureWeakDmatricesTet3D(const int _N, + const memory _r, + const memory _s, + const memory _t, + const memory _cubr, + const memory _cubs, + const memory _cubt, + memory& _cubPDT){ - dfloat *V = (dfloat*) malloc(_Np*_Np*sizeof(dfloat)); - VandermondeTet3D(_N, _Np, _r, 
_s, _t, V); + const int _Np = (_N+1)*(_N+2)*(_N+3)/6; + const int _cubNp = _cubr.length(); - dfloat *cubV = (dfloat*) malloc(_cubNp*_Np*sizeof(dfloat)); - dfloat *cubVr = (dfloat*) malloc(_cubNp*_Np*sizeof(dfloat)); - dfloat *cubVs = (dfloat*) malloc(_cubNp*_Np*sizeof(dfloat)); - dfloat *cubVt = (dfloat*) malloc(_cubNp*_Np*sizeof(dfloat)); - VandermondeTet3D(_N, _cubNp, _cubr, _cubs, _cubt, cubV); - GradVandermondeTet3D(_N, _cubNp, _cubr, _cubs, _cubt, cubVr, cubVs, cubVt); + memory V; + VandermondeTet3D(_N, _r, _s, _t, V); + + memory cubV, cubVr, cubVs, cubVt; + VandermondeTet3D(_N, _cubr, _cubs, _cubt, cubV); + GradVandermondeTet3D(_N, _cubr, _cubs, _cubt, cubVr, cubVs, cubVt); // cubPDrT = V*transpose(cVr); // cubPDsT = V*transpose(cVs); // cubPDtT = V*transpose(cVt); + _cubPDT.malloc(3*_Np*_cubNp); + memory _cubPDrT = _cubPDT + 0*_Np*_cubNp; + memory _cubPDsT = _cubPDT + 1*_Np*_cubNp; + memory _cubPDtT = _cubPDT + 2*_Np*_cubNp; for(int n=0;n<_Np;++n){ for(int m=0;m<_cubNp;++m){ dfloat resPDrT = 0, resPDsT = 0, resPDtT = 0; @@ -531,23 +736,31 @@ void mesh_t::CubatureWeakDmatricesTet3D(int _N, int _Np, dfloat *_r, dfloat *_s, _cubPDtT[n*_cubNp+m] = resPDtT; } } - free(V); free(cubV); free(cubVr); free(cubVs); free(cubVt); } -void mesh_t::CubatureSurfaceMatricesTet3D(int _N, int _Np, dfloat *_r, dfloat *_s, dfloat *_t, int *_faceNodes, - int _intNfp, dfloat *_intr, dfloat *_ints, dfloat *_intw, - dfloat *_intInterp, dfloat *_intLIFT){ - - int _Nfaces = 4; - int _Nfp = (_N+1)*(_N+2)/2; - - dfloat *V = (dfloat*) malloc(_Np*_Np*sizeof(dfloat)); - VandermondeTet3D(_N, _Np, _r, _s, _t, V); - - dfloat *ir = (dfloat*) calloc(_intNfp*_Nfaces, sizeof(dfloat)); - dfloat *is = (dfloat*) calloc(_intNfp*_Nfaces, sizeof(dfloat)); - dfloat *it = (dfloat*) calloc(_intNfp*_Nfaces, sizeof(dfloat)); - dfloat *iw = (dfloat*) calloc(_intNfp*_Nfaces, sizeof(dfloat)); +void mesh_t::CubatureSurfaceMatricesTet3D(const int _N, + const memory _r, + const memory _s, + const memory _t, + 
const memory _faceNodes, + const memory _intr, + const memory _ints, + const memory _intw, + memory& _intInterp, + memory& _intLIFT){ + + const int _Np = (_N+1)*(_N+2)*(_N+3)/6; + const int _Nfp = (_N+1)*(_N+2)/2; + const int _Nfaces = 4; + const int _intNfp = _intr.length(); + + memory V; + VandermondeTet3D(_N, _r, _s, _t, V); + + memory ir(_intNfp*_Nfaces); + memory is(_intNfp*_Nfaces); + memory it(_intNfp*_Nfaces); + memory iw(_intNfp*_Nfaces); for(int n=0;n<_intNfp;++n){ ir[0*_intNfp + n] = _intr[n]; @@ -571,9 +784,10 @@ void mesh_t::CubatureSurfaceMatricesTet3D(int _N, int _Np, dfloat *_r, dfloat *_ iw[3*_intNfp + n] = _intw[n]; } - dfloat *sInterp = (dfloat*) malloc(_intNfp*_Nfaces*_Np*sizeof(dfloat)); - InterpolationMatrixTet3D(_N, _Np, _r, _s, _t, _Nfaces*_intNfp, ir, is, it, sInterp); + memory sInterp; + InterpolationMatrixTet3D(_N, _r, _s, _t, ir, is, it, sInterp); + _intInterp.malloc(_Nfaces*_intNfp*_Nfp); for(int n=0;n<_intNfp;++n){ for(int m=0;m<_Nfp;++m){ _intInterp[0*_intNfp*_Nfp + n*_Nfp + m] = sInterp[(n+0*_intNfp)*_Np+_faceNodes[0*_Nfp+m]]; @@ -585,6 +799,7 @@ void mesh_t::CubatureSurfaceMatricesTet3D(int _N, int _Np, dfloat *_r, dfloat *_ // integration node lift matrix //iLIFT = V*V'*sInterp'*diag(iw(:)); + _intLIFT.malloc(_Nfaces*_intNfp*_Np); for(int n=0;n<_Nfaces*_intNfp;++n){ for(int m=0;m<_Np;++m){ _intLIFT[m*_Nfaces*_intNfp+n] = 0.0; @@ -595,19 +810,24 @@ void mesh_t::CubatureSurfaceMatricesTet3D(int _N, int _Np, dfloat *_r, dfloat *_ } } } - - free(V); free(ir); free(is); free(it); free(iw); free(sInterp); } -void mesh_t::SEMFEMInterpMatrixTet3D(int _N, - int _Np, dfloat *_r, dfloat *_s, dfloat *_t, - int _NpFEM, dfloat *_rFEM, dfloat *_sFEM, dfloat *_tFEM, - dfloat *I){ +void mesh_t::SEMFEMInterpMatrixTet3D(const int _N, + const memory _r, + const memory _s, + const memory _t, + const memory _rFEM, + const memory _sFEM, + const memory _tFEM, + memory& I){ + + const int _Np = (_N+1)*(_N+2)*(_N+3)/6; + const int _NpFEM = _rFEM.length(); - 
dfloat *IQN = (dfloat*) malloc(_NpFEM*_Np*sizeof(dfloat)); - InterpolationMatrixTet3D(_N, _Np, _r, _s, _t, _NpFEM, _rFEM, _sFEM, _tFEM, IQN); + memory IQN; + InterpolationMatrixTet3D(_N, _r, _s, _t, _rFEM, _sFEM, _tFEM, IQN); - dfloat *IQTIQ = (dfloat*) malloc(_Np*_Np*sizeof(dfloat)); + memory IQTIQ(_Np*_Np); // IQTIQ = IQN'*IQN for(int n=0;n<_Np;++n){ for(int m=0;m<_Np;++m){ @@ -619,9 +839,7 @@ void mesh_t::SEMFEMInterpMatrixTet3D(int _N, } // I = IQN/(IQN'*IQN) - pseudo inverse - matrixRightSolve(_NpFEM, _Np, IQN, _Np, _Np, IQTIQ, I); - - free(IQN); free(IQTIQ); + linAlg_t::matrixRightSolve(_NpFEM, _Np, IQN, _Np, _Np, IQTIQ, I); } // ------------------------------------------------------------------------ @@ -630,16 +848,24 @@ void mesh_t::SEMFEMInterpMatrixTet3D(int _N, // Journal of engineering mathematics, 56(3), 247-262. // ------------------------------------------------------------------------ -static void xyztorst(int Npoints, dfloat *x, dfloat *y, dfloat *z, dfloat *r, dfloat *s, dfloat *t) { +static void xyztorst(const memory x, + const memory y, + const memory z, + memory r, + memory s, + memory t) { + + const int Npoints = x.length(); + // vertices of tetrahedron dfloat v1[3] = {-1.0, -1./sqrt(3.), -1./sqrt(6.)}; dfloat v2[3] = { 1.0, -1./sqrt(3.), -1./sqrt(6.)}; dfloat v3[3] = { 0.0, 2./sqrt(3.), -1./sqrt(6.)}; dfloat v4[3] = { 0.0, 0., 3./sqrt(6.)}; - dfloat *XYZ = (dfloat *) malloc(3*Npoints*sizeof(dfloat)); - dfloat *RST = (dfloat *) malloc(3*Npoints*sizeof(dfloat)); - dfloat *A = (dfloat *) malloc(3*3*sizeof(dfloat)); + memory XYZ(3*Npoints); + memory RST(3*Npoints); + memory A(3*3); for (int i=0;i<3;i++) { A[0*3+i] = 0.5*(v2[i]-v1[i]); @@ -653,30 +879,33 @@ static void xyztorst(int Npoints, dfloat *x, dfloat *y, dfloat *z, dfloat *r, df XYZ[3*n+2] = z[n]-0.5*(v2[2]+v3[2]+v4[2]-v1[2]); } - matrixRightSolve(Npoints, 3, XYZ, 3, 3, A, RST); + linAlg_t::matrixRightSolve(Npoints, 3, XYZ, 3, 3, A, RST); for (int n=0;n L1, + const memory L2, + const 
memory L3, + memory w1, + memory w2) { // Compute scaled warp function at order N // based on rout interpolation nodes - dfloat *dL32 = (dfloat*) malloc(Npoints*sizeof(dfloat)); - dfloat *dL13 = (dfloat*) malloc(Npoints*sizeof(dfloat)); - dfloat *dL21 = (dfloat*) malloc(Npoints*sizeof(dfloat)); + const int Npoints = L1.length(); - dfloat *warpf1 = (dfloat*) malloc(Npoints*sizeof(dfloat)); - dfloat *warpf2 = (dfloat*) malloc(Npoints*sizeof(dfloat)); - dfloat *warpf3 = (dfloat*) malloc(Npoints*sizeof(dfloat)); + memory dL32(Npoints); + memory dL13(Npoints); + memory dL21(Npoints); + + memory warpf1(Npoints); + memory warpf2(Npoints); + memory warpf3(Npoints); for (int n=0;n _r, + memory _s, + memory _t, + const dfloat alphaIn){ const dfloat alpopt[15] = {0.0000,0.0000,0.00000,0.1002,1.1332,1.5608,1.3413, 1.2577,1.1603,1.10153,0.6080,0.4523,0.8856,0.8717,0.9655}; + const int _Npoints = _r.length(); + dfloat alpha; if (alphaIn==-1) { if (_N<16) { @@ -746,18 +978,18 @@ void mesh_t::WarpBlendTransformTet3D(int _N, int _Npoints, dfloat *_r, dfloat *_ } // Convert r s coordinates to points in equilateral triangle - dfloat *L1 = (dfloat*) malloc(_Npoints*sizeof(dfloat)); - dfloat *L2 = (dfloat*) malloc(_Npoints*sizeof(dfloat)); - dfloat *L3 = (dfloat*) malloc(_Npoints*sizeof(dfloat)); - dfloat *L4 = (dfloat*) malloc(_Npoints*sizeof(dfloat)); + memory L1(_Npoints); + memory L2(_Npoints); + memory L3(_Npoints); + memory L4(_Npoints); - dfloat *_x = (dfloat*) malloc(_Npoints*sizeof(dfloat)); - dfloat *_y = (dfloat*) malloc(_Npoints*sizeof(dfloat)); - dfloat *_z = (dfloat*) malloc(_Npoints*sizeof(dfloat)); + memory _x(_Npoints); + memory _y(_Npoints); + memory _z(_Npoints); - dfloat *shiftx = (dfloat*) calloc(_Npoints,sizeof(dfloat)); - dfloat *shifty = (dfloat*) calloc(_Npoints,sizeof(dfloat)); - dfloat *shiftz = (dfloat*) calloc(_Npoints,sizeof(dfloat)); + memory shiftx(_Npoints,0.0); + memory shifty(_Npoints,0.0); + memory shiftz(_Npoints,0.0); for (int n=0;n<_Npoints;n++) { 
L1[n] = 0.5*(1.+_t[n]); @@ -770,18 +1002,18 @@ void mesh_t::WarpBlendTransformTet3D(int _N, int _Npoints, dfloat *_r, dfloat *_ _z[n] = L3[n]*v1[2]+L4[n]*v2[2]+L2[n]*v3[2]+L1[n]*v4[2]; } - dfloat *warp1 = (dfloat*) calloc(_Npoints,sizeof(dfloat)); - dfloat *warp2 = (dfloat*) calloc(_Npoints,sizeof(dfloat)); + memory warp1(_Npoints); + memory warp2(_Npoints); for (int f=0;f<4;f++) { - dfloat *La, *Lb, *Lc, *Ld; + memory La, Lb, Lc, Ld; if(f==0) {La = L1; Lb = L2; Lc = L3; Ld = L4;} if(f==1) {La = L2; Lb = L1; Lc = L3; Ld = L4;} if(f==2) {La = L3; Lb = L1; Lc = L4; Ld = L2;} if(f==3) {La = L4; Lb = L1; Lc = L3; Ld = L2;} // compute warp tangential to face - WarpShiftFace3D(_N, _Npoints, alpha, Lb, Lc, Ld, warp1, warp2); + WarpShiftFace3D(_N, alpha, Lb, Lc, Ld, warp1, warp2); for (int n=0;n<_Npoints;n++) { dfloat blend = Lb[n]*Lc[n]*Ld[n]; @@ -811,12 +1043,7 @@ void mesh_t::WarpBlendTransformTet3D(int _N, int _Npoints, dfloat *_r, dfloat *_ _z[n] += shiftz[n]; } - xyztorst(_Npoints, _x, _y, _z, _r, _s, _t); - - free(L1); free(L2); free(L3); free(L4); - free(warp1); free(warp2); - free(shiftx); free(shifty); free(shiftz); - free(_x); free(_y); free(_z); + xyztorst(_x, _y, _z, _r, _s, _t); } // ------------------------------------------------------------------------ @@ -901,19 +1128,22 @@ static const dfloat cubT15[214] = {-3.592259421353274e-01,-3.592259421353629e-01 static const dfloat cubW15[214] = { 3.522723551354820e-03, 3.522723551352486e-03, 3.522723551352938e-03, 9.232955535331875e-03, 9.232955535327506e-03, 9.232955535330659e-03, 4.237026901463632e-03, 4.237026901464212e-03, 4.237026901463377e-03, 6.106499343749692e-03, 6.106499343748844e-03, 6.106499343750059e-03, 1.627360858046573e-03, 1.627360858046587e-03, 1.627360858046813e-03, 1.148548912222280e-03, 1.148548912221887e-03, 1.148548912221821e-03, 2.564399625003663e-03, 2.564399625004229e-03, 2.564399625002914e-03, 5.856076670044469e-03, 5.856076670043464e-03, 5.856076670043295e-03, 1.423643710751563e-03, 
1.423643710751888e-03, 1.423643710751167e-03, 3.978120910726397e-03, 3.978120910728038e-03, 3.978120910727048e-03, 1.363809538497426e-03, 1.363809538497801e-03, 1.363809538497208e-03, 4.075959956903368e-04, 4.075959956909096e-04, 4.075959956900526e-04, 2.860089500953389e-03, 2.860089500953686e-03, 2.860089500953841e-03, 4.850559515934299e-03, 4.850559515933083e-03, 4.850559515932247e-03, 1.352971840215884e-02, 1.352971840216170e-02, 1.352971840215909e-02, 6.132094336152466e-03, 6.132094336152919e-03, 6.132094336153004e-03, 1.461675478856591e-02, 1.461675478855672e-02, 1.461675478856139e-02, 5.990023547122631e-03, 5.990023547123226e-03, 5.990023547122334e-03, 9.631452064974138e-03, 9.631452064972767e-03, 9.631452064974280e-03, 5.614990995964819e-03, 5.614990995964790e-03, 5.614990995963984e-03, 1.144090371383849e-03, 1.144090371383791e-03, 1.144090371383924e-03, 7.816866183700298e-03, 7.816866183699294e-03, 7.816866183700680e-03, 1.639566856148552e-02, 1.639566856149825e-02, 1.639566856148552e-02, 6.721523996645333e-03, 6.721523996646124e-03, 6.721523996646986e-03, 6.261479412226208e-03, 6.261479412226463e-03, 6.261479412227057e-03, 1.527689367197913e-02, 1.527689367198168e-02, 1.527689367197715e-02, 4.042885330071795e-03, 4.042885330072389e-03, 4.042885330072658e-03, 1.229463521022889e-02, 1.229463521022621e-02, 1.229463521022872e-02, 5.427737795462986e-03, 5.427737795463169e-03, 5.427737795462180e-03, 1.976199256754921e-03, 1.976199256754936e-03, 1.976199256754398e-03, 6.919752984901465e-03, 6.919752984901423e-03, 6.919752984901507e-03, 1.046027736959284e-02, 1.046027736959218e-02, 1.046027736959269e-02, 1.447481953654798e-02, 1.447481953654770e-02, 1.447481953654855e-02, 2.391550565412188e-03, 2.391550565412258e-03, 2.391550565412131e-03, 1.511375252393240e-02, 1.511375252392830e-02, 1.511375252393155e-02, 9.511095825311733e-03, 9.511095825312935e-03, 9.511095825312142e-03, 4.036967691998535e-03, 4.036967691999214e-03, 4.036967691998252e-03, 
1.508260348095485e-03, 1.508260348095230e-03, 1.508260348095372e-03, 3.937346811175521e-03, 3.937346811175041e-03, 3.937346811175210e-03, 3.608140871956763e-03, 3.608140871956792e-03, 3.608140871956891e-03, 3.365526474882623e-03, 3.365526474883641e-03, 3.365526474884108e-03, 5.192349870771271e-03, 5.192349870771441e-03, 5.192349870772318e-03, 1.334407891882406e-02, 1.334407891882760e-02, 1.334407891882411e-02, 8.025809123555732e-03, 8.025809123554318e-03, 8.025809123554217e-03, 1.070887983557521e-02, 1.070887983557467e-02, 1.070887983557649e-02, 1.309736406162602e-02, 1.309736406162684e-02, 1.309736406162661e-02, 8.885655025694786e-03, 8.885655025694404e-03, 8.885655025693740e-03, 1.344113214741205e-02, 1.344113214741205e-02, 1.344113214741270e-02, 5.698568685004529e-03, 5.698568685005180e-03, 5.698568685005053e-03, 3.847402885188293e-03, 3.847402885188307e-03, 3.847402885188477e-03, 4.769112898361405e-03, 4.769112898361265e-03, 4.769112898362141e-03, 1.118686556204748e-02, 1.118686556204881e-02, 1.118686556204810e-02, 5.635998890136026e-03, 5.635998890136154e-03, 5.635998890136069e-03, 5.044085510105154e-03, 5.044085510104333e-03, 5.044085510104730e-03, 1.014203848072567e-02, 1.014203848072546e-02, 1.014203848072654e-02, 2.681590717335358e-03, 2.681590717334821e-03, 2.681590717335202e-03, 4.059907280598535e-03, 4.059907280598903e-03, 4.059907280598535e-03, 3.336448036565914e-03, 3.336448036565589e-03, 3.336448036566042e-03, 1.076375181138509e-02, 1.076375181138425e-02, 1.076375181138517e-02, 6.573170212491274e-03, 6.573170212491260e-03, 6.573170212491358e-03, 8.877430748418416e-04, 8.877430748418500e-04, 8.877430748417468e-04, 1.503300080640668e-03, 1.503300080640583e-03, 1.503300080640908e-03, 5.744672404119040e-03, 5.744672404119181e-03, 5.744672404119252e-03, 1.906496203051725e-03, 1.906496203050283e-03, 1.906496203049802e-03, 2.124665847459818e-03, 2.124665847459747e-03, 2.124665847459747e-03, 7.203086774524117e-04, 7.203086774524287e-04, 
7.203086774520808e-04, 6.473262420273394e-04, 6.473262420272856e-04, 6.473262420267072e-04, 2.381499975344257e-03, 2.381499975344469e-03, 2.381499975345459e-03, 3.013274913110685e-03, 3.013274913110755e-03, 3.013274913111859e-03, 2.067897521108355e-03, 2.067897521107776e-03, 2.067897521108949e-03, 1.499773420159665e-02, 1.925991642486046e-02, 1.254797412755794e-02, 1.209377740627947e-02}; -void mesh_t::CubatureNodesTet3D(int cubTetN, int *_cubNp, dfloat **_cubr, dfloat **_cubs, dfloat **_cubt, dfloat **_cubw){ +void mesh_t::CubatureNodesTet3D(const int cubTetN, + int& _cubNp, + memory& _cubr, + memory& _cubs, + memory& _cubt, + memory& _cubw){ - if (cubTetN>15) - LIBP_ABORT(string("Requested Cubature order unavailable.")) + LIBP_ABORT("Requested Cubature order unavailable.", + cubTetN>15); - int cubTetNp = cubTetNps[cubTetN-1]; + _cubNp = cubTetNps[cubTetN-1]; - *_cubNp = cubTetNp; - - *_cubr = (dfloat*) calloc(cubTetNp, sizeof(dfloat)); - *_cubs = (dfloat*) calloc(cubTetNp, sizeof(dfloat)); - *_cubt = (dfloat*) calloc(cubTetNp, sizeof(dfloat)); - *_cubw = (dfloat*) calloc(cubTetNp, sizeof(dfloat)); + _cubr.malloc(_cubNp); + _cubs.malloc(_cubNp); + _cubt.malloc(_cubNp); + _cubw.malloc(_cubNp); const dfloat *cubTetR=NULL, *cubTetS=NULL, *cubTetT=NULL, *cubTetW=NULL; switch(cubTetN){ @@ -933,13 +1163,15 @@ void mesh_t::CubatureNodesTet3D(int cubTetN, int *_cubNp, dfloat **_cubr, dfloat case 14: cubTetR = cubR14; cubTetS = cubS14; cubTetT = cubT14; cubTetW = cubW14; break; case 15: cubTetR = cubR15; cubTetS = cubS15; cubTetT = cubT15; cubTetW = cubW15; break; default: - LIBP_ABORT(string("Requested Cubature order unavailable.")) + LIBP_FORCE_ABORT("Requested Cubature order unavailable."); } - for(int n=0;n& _r, + memory& _s){ EquispacedNodesTri2D(_N, _r, _s); //make equispaced nodes on reference triangle - WarpBlendTransformTri2D(_N, _Np, _r, _s); //apply warp&blend transform + WarpBlendTransformTri2D(_N, _r, _s); //apply warp&blend transform } -void 
mesh_t::FaceNodesTri2D(int _N, dfloat *_r, dfloat *_s, int *_faceNodes){ - int _Nfp = _N+1; - int _Np = (_N+1)*(_N+2)/2; +void mesh_t::FaceNodesTri2D(const int _N, + const memory _r, + const memory _s, + memory& _faceNodes){ + const int _Nfp = _N+1; + const int _Np = (_N+1)*(_N+2)/2; int cnt[3]; for (int i=0;i<3;i++) cnt[i]=0; - dfloat deps = 1.; - while((1.+deps)>1.) - deps *= 0.5; - - const dfloat NODETOL = 1000.*deps; + const dfloat NODETOL = 1.0e-5; + _faceNodes.malloc(3*_Nfp); for (int n=0;n<_Np;n++) { if(fabs(_s[n]+1) _r, + const memory _s, + memory& _vertexNodes){ + const int _Np = (_N+1)*(_N+2)/2; dfloat deps = 1.; while((1.+deps)>1.) @@ -70,6 +73,7 @@ void mesh_t::VertexNodesTri2D(int _N, dfloat *_r, dfloat *_s, int *_vertexNodes) const dfloat NODETOL = 1000.*deps; + _vertexNodes.malloc(3); for(int n=0;n<_Np;++n){ if( (_r[n]+1)*(_r[n]+1)+(_s[n]+1)*(_s[n]+1) _r, + const memory _s, + const memory _faceNodes, + const memory _faceVertices, + memory& R){ + + const int _Nfaces = 3; + const int _Nverts = 3; + const int _NfaceVertices = 2; + + const int _Nfp = _faceNodes.length()/_Nfaces; + + const dfloat NODETOL = 1.0e-5; + + dfloat V[2] = {-1.0, 1.0}; + + dfloat EX0[_Nverts]; + dfloat EX1[_Nverts]; + + memory x0(_Nfp); + memory x1(_Nfp); + + R.malloc(_Nfaces*_Nfaces*_NfaceVertices*_Nfp); + + for (int fM=0;fM<_Nfaces;fM++) { + + for (int v=0;v<_Nverts;v++) { + EX0[v] = 0.0; + } + //setup top element with face fM on the bottom + for (int v=0;v<_NfaceVertices;v++) { + int fv = _faceVertices[fM*_NfaceVertices + v]; + EX0[fv] = V[v]; + } + + for(int n=0;n<_Nfp;++n){ /* for each face node */ + const int fn = _faceNodes[fM*_Nfp+n]; + + /* (r,s) coordinates of interpolation nodes*/ + dfloat rn = _r[fn]; + dfloat sn = _s[fn]; + + /* physical coordinate of interpolation node */ + x0[n] = -0.5*(rn+sn)*EX0[0] + + 0.5*(1+rn)*EX0[1] + + 0.5*(1+sn)*EX0[2]; + } + + for (int fP=0;fP<_Nfaces;fP++) { /*For each neighbor face */ + for (int rot=0;rot<_NfaceVertices;rot++) { /* For 
each face rotation */ + // Zero vertices + for (int v=0;v<_Nverts;v++) { + EX1[v] = 0.0; + } + //setup bottom element with face fP on the top + for (int v=0;v<_NfaceVertices;v++) { + int fv = _faceVertices[fP*_NfaceVertices + ((v+rot)%_NfaceVertices)]; + EX1[fv] = V[v]; + } + + for(int n=0;n<_Nfp;++n){ /* for each node */ + const int fn = _faceNodes[fP*_Nfp+n]; + + /* (r,s,t) coordinates of interpolation nodes*/ + dfloat rn = _r[fn]; + dfloat sn = _s[fn]; + + /* physical coordinate of interpolation node */ + x1[n] = -0.5*(rn+sn)*EX1[0] + + 0.5*(1+rn)*EX1[1] + + 0.5*(1+sn)*EX1[2]; + } + + /* for each node on this face find the neighbor node */ + for(int n=0;n<_Nfp;++n){ + const dfloat xM = x0[n]; + + int m=0; + for(;m<_Nfp;++m){ /* for each neighbor node */ + const dfloat xP = x1[m]; + + /* distance between target and neighbor node */ + const dfloat dist = pow(xM-xP,2); + + /* if neighbor node is close to target, match */ + if(distNODETOL); + } + } + } + } +} + // Create equidistributed nodes on reference triangle -void mesh_t::EquispacedNodesTri2D(int _N, dfloat *_r, dfloat *_s){ +void mesh_t::EquispacedNodesTri2D(const int _N, + memory& _r, + memory& _s){ + + const int _Np = (_N+1)*(_N+2)/2; + + _r.malloc(_Np); + _s.malloc(_Np); int sk = 0; for (int n=0;n<_N+1;n++) { @@ -93,8 +216,11 @@ void mesh_t::EquispacedNodesTri2D(int _N, dfloat *_r, dfloat *_s){ } } -void mesh_t::EquispacedEToVTri2D(int _N, int *_EToV){ - int _Nverts = 3; +void mesh_t::EquispacedEToVTri2D(const int _N, memory& _EToV){ + const int _Nverts = 3; + const int _Nelements = _N*_N; + + _EToV.malloc(_Nelements*_Nverts); int cnt=0; int sk=0; @@ -120,7 +246,10 @@ void mesh_t::EquispacedEToVTri2D(int _N, int *_EToV){ } } -void mesh_t::SEMFEMNodesTri2D(int _N, int *_Np, dfloat **_r, dfloat **_s){ +void mesh_t::SEMFEMNodesTri2D(const int _N, + int& _Np, + memory& _r, + memory& _s){ const dfloat alpopt[12] = {0.0000, 5.0000, 3.0000, 2.2073, 2.5259, 2.7113, 2.4368, 2.4564, 2.3948, 2.4346, 2.4653, 2.4691}; 
@@ -138,10 +267,9 @@ void mesh_t::SEMFEMNodesTri2D(int _N, int *_Np, dfloat **_r, dfloat **_s){ const dfloat NODETOL = 1000.*deps; - *_Np = (_N+1)*(_N+6)/2; - - *_r = (dfloat *) malloc((*_Np)*sizeof(dfloat)); - *_s = (dfloat *) malloc((*_Np)*sizeof(dfloat)); + _Np = (_N+1)*(_N+6)/2; + _r.malloc(_Np); + _s.malloc(_Np); int sk=0; //Order N+1 boundary @@ -151,8 +279,8 @@ void mesh_t::SEMFEMNodesTri2D(int _N, int *_Np, dfloat **_r, dfloat **_s){ dfloat ss = -1.0 + 2.0*n/(_N+1); if((fabs(ss+1)& _EToV){ + const int _Nverts = 3; - *_NelFEM = 6+6*(_N-1)+(_N-1)*(_N-1); - *_EToV = (int*) malloc((*_NelFEM)*_Nverts*sizeof(int)); + _NelFEM = 6+6*(_N-1)+(_N-1)*(_N-1); + _EToV.malloc(_NelFEM*_Nverts); //start with corner quads int cnt=0; int corner = 3*(_N+1); //first interior point - (*_EToV)[cnt*_Nverts+0] = 0; - (*_EToV)[cnt*_Nverts+1] = 1; - (*_EToV)[cnt*_Nverts+2] = _N+2; + _EToV[cnt*_Nverts+0] = 0; + _EToV[cnt*_Nverts+1] = 1; + _EToV[cnt*_Nverts+2] = _N+2; cnt++; - (*_EToV)[cnt*_Nverts+0] = 1; - (*_EToV)[cnt*_Nverts+1] = corner; - (*_EToV)[cnt*_Nverts+2] = _N+2; + _EToV[cnt*_Nverts+0] = 1; + _EToV[cnt*_Nverts+1] = corner; + _EToV[cnt*_Nverts+2] = _N+2; cnt++; corner += _N-1; //bottom right interior point - (*_EToV)[cnt*_Nverts+0] = _N; - (*_EToV)[cnt*_Nverts+1] = _N+1; - (*_EToV)[cnt*_Nverts+2] = _N+3; + _EToV[cnt*_Nverts+0] = _N; + _EToV[cnt*_Nverts+1] = _N+1; + _EToV[cnt*_Nverts+2] = _N+3; cnt++; - (*_EToV)[cnt*_Nverts+0] = _N; - (*_EToV)[cnt*_Nverts+1] = _N+3; - (*_EToV)[cnt*_Nverts+2] = corner; + _EToV[cnt*_Nverts+0] = _N; + _EToV[cnt*_Nverts+1] = _N+3; + _EToV[cnt*_Nverts+2] = corner; cnt++; corner = (_N+1)*(_N+6)/2-1; //top interior point - (*_EToV)[cnt*_Nverts+0] = 3*_N; - (*_EToV)[cnt*_Nverts+1] = 3*_N+1; - (*_EToV)[cnt*_Nverts+2] = 3*_N+2; + _EToV[cnt*_Nverts+0] = 3*_N; + _EToV[cnt*_Nverts+1] = 3*_N+1; + _EToV[cnt*_Nverts+2] = 3*_N+2; cnt++; - (*_EToV)[cnt*_Nverts+0] = 3*_N; - (*_EToV)[cnt*_Nverts+1] = corner; - (*_EToV)[cnt*_Nverts+2] = 3*_N+1; + 
_EToV[cnt*_Nverts+0] = 3*_N; + _EToV[cnt*_Nverts+1] = corner; + _EToV[cnt*_Nverts+2] = 3*_N+1; cnt++; //next the edges corner = 3*(_N+1); //first interior point int inc = 1; // increment to next interior point along this edge for (int i=0;i<_N-1;i++) { - (*_EToV)[cnt*_Nverts+0] = i+1; - (*_EToV)[cnt*_Nverts+1] = i+2; - (*_EToV)[cnt*_Nverts+2] = corner; + _EToV[cnt*_Nverts+0] = i+1; + _EToV[cnt*_Nverts+1] = i+2; + _EToV[cnt*_Nverts+2] = corner; cnt++; - (*_EToV)[cnt*_Nverts+0] = i+2; - (*_EToV)[cnt*_Nverts+1] = corner+inc; - (*_EToV)[cnt*_Nverts+2] = corner; + _EToV[cnt*_Nverts+0] = i+2; + _EToV[cnt*_Nverts+1] = corner+inc; + _EToV[cnt*_Nverts+2] = corner; cnt++; corner += inc; } @@ -229,14 +359,14 @@ void mesh_t::SEMFEMEToVTri2D(int _N, int *_NelFEM, int **_EToV){ corner = 3*(_N+1); //first interior point inc = _N; // increment to next interior point along this edge for (int i=0;i<_N-1;i++) { - (*_EToV)[cnt*_Nverts+0] = _N+2+2*i; - (*_EToV)[cnt*_Nverts+1] = corner; - (*_EToV)[cnt*_Nverts+2] = _N+4+2*i; + _EToV[cnt*_Nverts+0] = _N+2+2*i; + _EToV[cnt*_Nverts+1] = corner; + _EToV[cnt*_Nverts+2] = _N+4+2*i; cnt++; - (*_EToV)[cnt*_Nverts+0] = corner; - (*_EToV)[cnt*_Nverts+1] = corner+inc; - (*_EToV)[cnt*_Nverts+2] = _N+4+2*i; + _EToV[cnt*_Nverts+0] = corner; + _EToV[cnt*_Nverts+1] = corner+inc; + _EToV[cnt*_Nverts+2] = _N+4+2*i; cnt++; corner += inc; inc--; @@ -245,14 +375,14 @@ void mesh_t::SEMFEMEToVTri2D(int _N, int *_NelFEM, int **_EToV){ corner = 3*(_N+1)+_N-1; //bottom right interior point inc = _N-1; // increment to next interior point along this edge for (int i=0;i<_N-1;i++) { - (*_EToV)[cnt*_Nverts+0] = corner; - (*_EToV)[cnt*_Nverts+1] = _N+3+2*i; - (*_EToV)[cnt*_Nverts+2] = _N+5+2*i; + _EToV[cnt*_Nverts+0] = corner; + _EToV[cnt*_Nverts+1] = _N+3+2*i; + _EToV[cnt*_Nverts+2] = _N+5+2*i; cnt++; - (*_EToV)[cnt*_Nverts+0] = corner; - (*_EToV)[cnt*_Nverts+1] = _N+5+2*i; - (*_EToV)[cnt*_Nverts+2] = corner+inc; + _EToV[cnt*_Nverts+0] = corner; + _EToV[cnt*_Nverts+1] 
= _N+5+2*i; + _EToV[cnt*_Nverts+2] = corner+inc; cnt++; corner += inc; inc--; @@ -265,15 +395,15 @@ void mesh_t::SEMFEMEToVTri2D(int _N, int *_NelFEM, int **_EToV){ int shift = _N-j; //number of nodes in this row for (int i=0;i<_N-j-1;i++) { - (*_EToV)[cnt*_Nverts+0] = sk ; - (*_EToV)[cnt*_Nverts+1] = sk+1; - (*_EToV)[cnt*_Nverts+2] = sk+shift; + _EToV[cnt*_Nverts+0] = sk ; + _EToV[cnt*_Nverts+1] = sk+1; + _EToV[cnt*_Nverts+2] = sk+shift; cnt++; if (i!=_N-j-2) { - (*_EToV)[cnt*_Nverts+0] = sk+1; - (*_EToV)[cnt*_Nverts+1] = sk+shift+1; - (*_EToV)[cnt*_Nverts+2] = sk+shift; + _EToV[cnt*_Nverts+0] = sk+1; + _EToV[cnt*_Nverts+1] = sk+shift+1; + _EToV[cnt*_Nverts+2] = sk+shift; cnt++; } sk++; @@ -285,7 +415,9 @@ void mesh_t::SEMFEMEToVTri2D(int _N, int *_NelFEM, int **_EToV){ // ------------------------------------------------------------------------ // ORTHONORMAL BASIS POLYNOMIALS // ------------------------------------------------------------------------ -void mesh_t::OrthonormalBasisTri2D(dfloat _r, dfloat _s, int i, int j, dfloat *P){ +void mesh_t::OrthonormalBasisTri2D(const dfloat _r, const dfloat _s, + const int i, const int j, + dfloat& P){ dfloat a,b; if(_s != 1.) a = 2.*(1.+_r)/(1.-_s)-1.; @@ -293,10 +425,12 @@ void mesh_t::OrthonormalBasisTri2D(dfloat _r, dfloat _s, int i, int j, dfloat *P a = -1.; b=_s; - *P = sqrt(2.0)*JacobiP(a,0,0,i)*JacobiP(b,2*i+1,0,j)*pow(1.-b,i); + P = sqrt(2.0)*JacobiP(a,0,0,i)*JacobiP(b,2*i+1,0,j)*pow(1.-b,i); } -void mesh_t::GradOrthonormalBasisTri2D(dfloat _r, dfloat _s, int i, int j, dfloat *Pr, dfloat *Ps){ +void mesh_t::GradOrthonormalBasisTri2D(const dfloat _r, const dfloat _s, + const int i, const int j, + dfloat& Pr, dfloat& Ps){ dfloat a,b; if(_s != 1.) 
a = 2.*(1.+_r)/(1.-_s)-1.; @@ -309,56 +443,68 @@ void mesh_t::GradOrthonormalBasisTri2D(dfloat _r, dfloat _s, int i, int j, dfloa // r-derivative // d/dr = da/dr d/da + db/dr d/db = (2/(1-s)) d/da = (2/(1-b)) d/da - (*Pr) = dfa*gb; + Pr = dfa*gb; if(i>0) - (*Pr) = (*Pr)*pow(0.5*(1-b),i-1); + Pr = Pr*pow(0.5*(1-b),i-1); // s-derivative // d/ds = ((1+a)/2)/((1-b)/2) d/da + d/db - (*Ps) = dfa*(gb*(0.5*(1+a))); + Ps = dfa*(gb*(0.5*(1+a))); if(i>0) - (*Ps) = (*Ps)*pow(0.5*(1-b),i-1); + Ps = Ps*pow(0.5*(1-b),i-1); dfloat tmp = dgb*pow(0.5*(1-b),i); if(i>0) tmp = tmp-0.5*i*gb*pow(0.5*(1-b),i-1); - (*Ps) = (*Ps)+fa*tmp; + Ps = Ps+fa*tmp; // Normalize - (*Pr) *= pow(2,i+0.5); (*Ps) *= pow(2,i+0.5); + Pr *= pow(2,i+0.5); Ps *= pow(2,i+0.5); } // ------------------------------------------------------------------------ // 2D VANDERMONDE MATRICES // ------------------------------------------------------------------------ -void mesh_t::VandermondeTri2D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloat *V){ +void mesh_t::VandermondeTri2D(const int _N, + const memory _r, + const memory _s, + memory& V){ - int _Np = (_N+1)*(_N+2)/2; + const int _Np = (_N+1)*(_N+2)/2; + const int Npoints = _r.length(); + V.malloc(Npoints*_Np); for(int n=0; n _r, + const memory _s, + memory& Vr, + memory& Vs){ - int _Np = (_N+1)*(_N+2)/2; + const int _Np = (_N+1)*(_N+2)/2; + const int Npoints = _r.length(); + Vr.malloc(Npoints*_Np); + Vs.malloc(Npoints*_Np); for(int n=0; n V, + memory& _MM){ // massMatrix = inv(V')*inv(V) = inv(V*V') + _MM.malloc(_Np*_Np); for(int n=0;n<_Np;++n){ for(int m=0;m<_Np;++m){ dfloat res = 0; @@ -380,12 +529,15 @@ void mesh_t::MassMatrixTri2D(int _Np, dfloat *V, dfloat *_MM){ _MM[n*_Np + m] = res; } } - matrixInverse(_Np, _MM); + linAlg_t::matrixInverse(_Np, _MM); } -void mesh_t::invMassMatrixTri2D(int _Np, dfloat *V, dfloat *_invMM){ +void mesh_t::invMassMatrixTri2D(const int _Np, + const memory V, + memory& _invMM){ // massMatrix^{-1} = V*V' + _invMM.malloc(_Np*_Np); 
for(int n=0;n<_Np;++n){ for(int m=0;m<_Np;++m){ dfloat res = 0; @@ -397,40 +549,41 @@ void mesh_t::invMassMatrixTri2D(int _Np, dfloat *V, dfloat *_invMM){ } } -void mesh_t::DmatrixTri2D(int _N, int Npoints, dfloat *_r, dfloat *_s, - dfloat *_Dr, dfloat *_Ds){ +void mesh_t::DmatrixTri2D(const int _N, + const memory _r, + const memory _s, + memory& _D){ - int _Np = (_N+1)*(_N+2)/2; + const int _Np = (_N+1)*(_N+2)/2; - dfloat *V = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat)); - dfloat *Vr = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat)); - dfloat *Vs = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat)); - - VandermondeTri2D(_N, Npoints, _r, _s, V); - GradVandermondeTri2D(_N, Npoints, _r, _s, Vr, Vs); + memory V, Vr, Vs; + VandermondeTri2D(_N, _r, _s, V); + GradVandermondeTri2D(_N, _r, _s, Vr, Vs); //Dr = Vr/V, Ds = Vs/V - matrixRightSolve(_Np, _Np, Vr, _Np, _Np, V, _Dr); - matrixRightSolve(_Np, _Np, Vs, _Np, _Np, V, _Ds); - - free(V); free(Vr); free(Vs); + _D.malloc(2*_Np*_Np); + memory _Dr = _D + 0*_Np*_Np; + memory _Ds = _D + 1*_Np*_Np; + linAlg_t::matrixRightSolve(_Np, _Np, Vr, _Np, _Np, V, _Dr); + linAlg_t::matrixRightSolve(_Np, _Np, Vs, _Np, _Np, V, _Ds); } -void mesh_t::LIFTmatrixTri2D(int _N, int *_faceNodes, - dfloat *_r, dfloat *_s, dfloat *_LIFT){ +void mesh_t::LIFTmatrixTri2D(const int _N, + const memory _faceNodes, + const memory _r, + const memory _s, + memory& _LIFT){ - int _Nfp = (_N+1); - int _Np = (_N+1)*(_N+2)/2; - int _Nfaces = 3; + const int _Nfp = (_N+1); + const int _Np = (_N+1)*(_N+2)/2; + const int _Nfaces = 3; - dfloat *E = (dfloat *) calloc(_Np*_Nfaces*_Nfp, sizeof(dfloat)); + memory E(_Np*_Nfaces*_Nfp, 0); - dfloat *r1D = (dfloat *) malloc(_Nfp*sizeof(dfloat)); - dfloat *V1D = (dfloat *) malloc(_Nfp*_Nfp*sizeof(dfloat)); - dfloat *MM1D = (dfloat *) malloc(_Nfp*_Nfp*sizeof(dfloat)); + memory r1D(_Nfp); for (int f=0;f<_Nfaces;f++) { - dfloat *rFace; + memory rFace; if (f==0) rFace = _r; if (f==1) rFace = _r; if (f==2) rFace = _s; @@ -438,7 
+591,8 @@ void mesh_t::LIFTmatrixTri2D(int _N, int *_faceNodes, for (int i=0;i<_Nfp;i++) r1D[i] = rFace[_faceNodes[f*_Nfp+i]]; - Vandermonde1D(_N, _Nfp, r1D, V1D); + memory V1D, MM1D; + Vandermonde1D(_N, r1D, V1D); MassMatrix1D(_Nfp, V1D, MM1D); for (int j=0;j<_Nfp;j++) { @@ -449,9 +603,10 @@ void mesh_t::LIFTmatrixTri2D(int _N, int *_faceNodes, } } - dfloat *V = (dfloat *) malloc(_Np*_Np*sizeof(dfloat)); - VandermondeTri2D(_N, _Np, _r, _s, V); + memory V; + VandermondeTri2D(_N, _r, _s, V); + _LIFT.malloc(_Np*_Nfaces*_Nfp); for (int n=0;n<_Np;n++) { for (int m=0;m<_Nfaces*_Nfp;m++) { @@ -465,16 +620,18 @@ void mesh_t::LIFTmatrixTri2D(int _N, int *_faceNodes, } } } - - free(V); free(r1D); free(V1D); free(MM1D); free(E); } -void mesh_t::SurfaceMassMatrixTri2D(int _N, dfloat *_MM, dfloat *_LIFT, dfloat *_sM){ +void mesh_t::SurfaceMassMatrixTri2D(const int _N, + const memory _MM, + const memory _LIFT, + memory& _sM){ - int _Nfp = (_N+1); - int _Np = (_N+1)*(_N+2)/2; - int _Nfaces = 3; + const int _Nfp = (_N+1); + const int _Np = (_N+1)*(_N+2)/2; + const int _Nfaces = 3; + _sM.malloc(_Np*_Nfaces*_Nfp); for (int n=0;n<_Np;n++) { for (int m=0;m<_Nfp*_Nfaces;m++) { _sM[m+n*_Nfp*_Nfaces] = 0; @@ -485,11 +642,18 @@ void mesh_t::SurfaceMassMatrixTri2D(int _N, dfloat *_MM, dfloat *_LIFT, dfloat * } } -void mesh_t::SmatrixTri2D(int _N, dfloat *_Dr, dfloat *_Ds, dfloat *_MM, - dfloat *_Srr, dfloat *_Srs, dfloat *_Sss){ +void mesh_t::SmatrixTri2D(const int _N, + const memory _Dr, + const memory _Ds, + const memory _MM, + memory& _S){ - int _Np = (_N+1)*(_N+2)/2; + const int _Np = (_N+1)*(_N+2)/2; + _S.malloc(3*_Np*_Np, 0.0); + memory _Srr = _S + 0*_Np*_Np; + memory _Srs = _S + 1*_Np*_Np; + memory _Sss = _S + 2*_Np*_Np; for (int n=0;n<_Np;n++) { for (int m=0;m<_Np;m++) { for (int k=0;k<_Np;k++) { @@ -504,56 +668,61 @@ void mesh_t::SmatrixTri2D(int _N, dfloat *_Dr, dfloat *_Ds, dfloat *_MM, } } -void mesh_t::InterpolationMatrixTri2D(int _N, - int NpointsIn, dfloat *rIn, dfloat 
*sIn, - int NpointsOut, dfloat *rOut, dfloat *sOut, - dfloat *I){ +void mesh_t::InterpolationMatrixTri2D(const int _N, + const memory rIn, + const memory sIn, + const memory rOut, + const memory sOut, + memory& I){ - int _Np = (_N+1)*(_N+2)/2; + const int _Np = (_N+1)*(_N+2)/2; - // need NpointsIn = _Np - if (NpointsIn != _Np) - LIBP_ABORT(string("Invalid Interplation operator requested.")) + const int NpointsIn = rIn.length(); + const int NpointsOut = rOut.length(); - dfloat *VIn = (dfloat*) malloc(NpointsIn*_Np*sizeof(dfloat)); - dfloat *VOut= (dfloat*) malloc(NpointsOut*_Np*sizeof(dfloat)); - - VandermondeTri2D(_N, NpointsIn, rIn, sIn, VIn); - VandermondeTri2D(_N, NpointsOut, rOut, sOut, VOut); + // need NpointsIn = _Np + LIBP_ABORT("Invalid Interplation operator requested.", + NpointsIn != _Np); - matrixRightSolve(NpointsOut, _Np, VOut, NpointsIn, _Np, VIn, I); + memory VIn; + memory VOut; + VandermondeTri2D(_N, rIn, sIn, VIn); + VandermondeTri2D(_N, rOut, sOut, VOut); - free(VIn); free(VOut); + I.malloc(NpointsIn*NpointsOut); + linAlg_t::matrixRightSolve(NpointsOut, _Np, VOut, + NpointsIn, _Np, VIn, I); } -void mesh_t::DegreeRaiseMatrixTri2D(int Nc, int Nf, dfloat *P){ - - int Npc = (Nc+1)*(Nc+2)/2; - int Npf = (Nf+1)*(Nf+2)/2; - - dfloat *rc = (dfloat *) malloc(Npc*sizeof(dfloat)); - dfloat *sc = (dfloat *) malloc(Npc*sizeof(dfloat)); - dfloat *rf = (dfloat *) malloc(Npf*sizeof(dfloat)); - dfloat *sf = (dfloat *) malloc(Npf*sizeof(dfloat)); +void mesh_t::DegreeRaiseMatrixTri2D(const int Nc, const int Nf, + memory& P){ + memory rc, sc; + memory rf, sf; NodesTri2D(Nc, rc, sc); NodesTri2D(Nf, rf, sf); - InterpolationMatrixTri2D(Nc, Npc, rc, sc, Npf, rf, sf, P); - - free(rc); free(sc); free(rf); free(sf); + InterpolationMatrixTri2D(Nc, rc, sc, rf, sf, P); } -void mesh_t::CubaturePmatrixTri2D(int _N, int _Np, dfloat *_r, dfloat *_s, - int _cubNp, dfloat *_cubr, dfloat *_cubs, dfloat *_cubProject){ +void mesh_t::CubaturePmatrixTri2D(const int _N, + const memory _r, 
+ const memory _s, + const memory _cubr, + const memory _cubs, + memory& _cubProject){ + + const int _Np = (_N+1)*(_N+2)/2; + const int _cubNp = _cubr.length(); - dfloat *V = (dfloat*) malloc(_Np*_Np*sizeof(dfloat)); - VandermondeTri2D(_N, _Np, _r, _s, V); + memory V; + VandermondeTri2D(_N, _r, _s, V); - dfloat *cubV = (dfloat*) malloc(_cubNp*_Np*sizeof(dfloat)); - VandermondeTri2D(_N, _cubNp, _cubr, _cubs, cubV); + memory cubV; + VandermondeTri2D(_N, _cubr, _cubs, cubV); // cubProject = V*cV' %% relies on (transpose(cV)*diag(cubw)*cV being the identity) + _cubProject.malloc(_Np*_cubNp); for(int n=0;n<_Np;++n){ for(int m=0;m<_cubNp;++m){ dfloat resP = 0; @@ -563,24 +732,30 @@ void mesh_t::CubaturePmatrixTri2D(int _N, int _Np, dfloat *_r, dfloat *_s, _cubProject[n*_cubNp+m] = resP; } } - free(V); free(cubV); } -void mesh_t::CubatureWeakDmatricesTri2D(int _N, int _Np, dfloat *_r, dfloat *_s, - int _cubNp, dfloat *_cubr, dfloat *_cubs, - dfloat *_cubPDrT, dfloat *_cubPDsT){ +void mesh_t::CubatureWeakDmatricesTri2D(const int _N, + const memory _r, + const memory _s, + const memory _cubr, + const memory _cubs, + memory& _cubPDT){ - dfloat *V = (dfloat*) malloc(_Np*_Np*sizeof(dfloat)); - VandermondeTri2D(_N, _Np, _r, _s, V); + const int _Np = (_N+1)*(_N+2)/2; + const int _cubNp = _cubr.length(); - dfloat *cubV = (dfloat*) malloc(_cubNp*_Np*sizeof(dfloat)); - dfloat *cubVr = (dfloat*) malloc(_cubNp*_Np*sizeof(dfloat)); - dfloat *cubVs = (dfloat*) malloc(_cubNp*_Np*sizeof(dfloat)); - VandermondeTri2D(_N, _cubNp, _cubr, _cubs, cubV); - GradVandermondeTri2D(_N, _cubNp, _cubr, _cubs, cubVr, cubVs); + memory V; + VandermondeTri2D(_N, _r, _s, V); + + memory cubV, cubVr, cubVs; + VandermondeTri2D(_N, _cubr, _cubs, cubV); + GradVandermondeTri2D(_N, _cubr, _cubs, cubVr, cubVs); // cubPDrT = V*transpose(cVr); // cubPDsT = V*transpose(cVs); + _cubPDT.malloc(2*_Np*_cubNp); + memory _cubPDrT = _cubPDT + 0*_Np*_cubNp; + memory _cubPDsT = _cubPDT + 1*_Np*_cubNp; for(int n=0;n<_Np;++n){ 
for(int m=0;m<_cubNp;++m){ dfloat resPDrT = 0, resPDsT = 0; @@ -593,22 +768,28 @@ void mesh_t::CubatureWeakDmatricesTri2D(int _N, int _Np, dfloat *_r, dfloat *_s, _cubPDsT[n*_cubNp+m] = resPDsT; } } - free(V); free(cubV); free(cubVr); free(cubVs); } -void mesh_t::CubatureSurfaceMatricesTri2D(int _N, int _Np, dfloat *_r, dfloat *_s, int *_faceNodes, - int _intNfp, dfloat *_intr, dfloat *_intw, - dfloat *_intInterp, dfloat *_intLIFT){ +void mesh_t::CubatureSurfaceMatricesTri2D(const int _N, + const memory _r, + const memory _s, + const memory _faceNodes, + const memory _intr, + const memory _intw, + memory& _intInterp, + memory& _intLIFT){ - int _Nfaces = 3; - int _Nfp = _N+1; + const int _Np = (_N+1)*(_N+2)/2; + const int _Nfaces = 3; + const int _Nfp = _N+1; + const int _intNfp = _intr.length(); - dfloat *V = (dfloat*) malloc(_Np*_Np*sizeof(dfloat)); - VandermondeTri2D(_N, _Np, _r, _s, V); + memory V; + VandermondeTri2D(_N, _r, _s, V); - dfloat *ir = (dfloat*) calloc(_intNfp*_Nfaces, sizeof(dfloat)); - dfloat *is = (dfloat*) calloc(_intNfp*_Nfaces, sizeof(dfloat)); - dfloat *iw = (dfloat*) calloc(_intNfp*_Nfaces, sizeof(dfloat)); + memory ir(_intNfp*_Nfaces); + memory is(_intNfp*_Nfaces); + memory iw(_intNfp*_Nfaces); for(int n=0;n<_intNfp;++n){ ir[0*_intNfp + n] = _intr[n]; @@ -624,9 +805,10 @@ void mesh_t::CubatureSurfaceMatricesTri2D(int _N, int _Np, dfloat *_r, dfloat *_ iw[2*_intNfp + n] = _intw[n]; } - dfloat *sInterp = (dfloat*) malloc(_intNfp*_Nfaces*_Np*sizeof(dfloat)); - InterpolationMatrixTri2D(_N, _Np, _r, _s, _Nfaces*_intNfp, ir, is, sInterp); + memory sInterp; + InterpolationMatrixTri2D(_N, _r, _s, ir, is, sInterp); + _intInterp.malloc(_Nfaces*_intNfp*_Nfp); for(int n=0;n<_intNfp;++n){ for(int m=0;m<_Nfp;++m){ _intInterp[0*_intNfp*_Nfp + n*_Nfp + m] = sInterp[(n+0*_intNfp)*_Np+_faceNodes[0*_Nfp+m]]; @@ -637,6 +819,7 @@ void mesh_t::CubatureSurfaceMatricesTri2D(int _N, int _Np, dfloat *_r, dfloat *_ // integration node lift matrix //iLIFT = 
V*V'*sInterp'*diag(iw(:)); + _intLIFT.malloc(_Nfaces*_intNfp*_Np); for(int n=0;n<_Nfaces*_intNfp;++n){ for(int m=0;m<_Np;++m){ _intLIFT[m*_Nfaces*_intNfp+n] = 0.0; @@ -647,19 +830,22 @@ void mesh_t::CubatureSurfaceMatricesTri2D(int _N, int _Np, dfloat *_r, dfloat *_ } } } - - free(V); free(ir); free(is); free(iw); free(sInterp); } -void mesh_t::SEMFEMInterpMatrixTri2D(int _N, - int _Np, dfloat *_r, dfloat *_s, - int _NpFEM, dfloat *_rFEM, dfloat *_sFEM, - dfloat *I){ +void mesh_t::SEMFEMInterpMatrixTri2D(const int _N, + const memory _r, + const memory _s, + const memory _rFEM, + const memory _sFEM, + memory& I){ - dfloat *IQN = (dfloat*) malloc(_NpFEM*_Np*sizeof(dfloat)); - InterpolationMatrixTri2D(_N, _Np, _r, _s, _NpFEM, _rFEM, _sFEM, IQN); + const int _Np = (_N+1)*(_N+2)/2; + const int _NpFEM = _rFEM.length(); - dfloat *IQTIQ = (dfloat*) malloc(_Np*_Np*sizeof(dfloat)); + memory IQN; + InterpolationMatrixTri2D(_N, _r, _s, _rFEM, _sFEM, IQN); + + memory IQTIQ(_Np*_Np); // IQTIQ = IQN'*IQN for(int n=0;n<_Np;++n){ for(int m=0;m<_Np;++m){ @@ -671,9 +857,8 @@ void mesh_t::SEMFEMInterpMatrixTri2D(int _N, } // I = IQN/(IQN'*IQN) - pseudo inverse - matrixRightSolve(_NpFEM, _Np, IQN, _Np, _Np, IQTIQ, I); - - free(IQN); free(IQTIQ); + I.malloc(_NpFEM*_Np); + linAlg_t::matrixRightSolve(_NpFEM, _Np, IQN, _Np, _Np, IQTIQ, I); } // ------------------------------------------------------------------------ @@ -682,19 +867,22 @@ void mesh_t::SEMFEMInterpMatrixTri2D(int _N, // Journal of engineering mathematics, 56(3), 247-262. 
// ------------------------------------------------------------------------ -void mesh_t::Warpfactor(int _N, int Npoints, dfloat *_r, dfloat *warp) { +void mesh_t::Warpfactor(const int _N, + const memory _r, + memory warp) { // Compute scaled warp function at order N // based on rout interpolation nodes + const int Npoints = _r.length(); // Compute GLL and equidistant node distribution - dfloat *GLLr = (dfloat *) malloc((_N+1)*sizeof(dfloat)); - dfloat *req = (dfloat *) malloc((_N+1)*sizeof(dfloat)); + memory GLLr; + memory req; JacobiGLL(_N, GLLr); EquispacedNodes1D(_N, req); // Make interpolation from req to r - dfloat *I = (dfloat*) malloc((_N+1)*Npoints*sizeof(dfloat)); - InterpolationMatrix1D(_N, _N+1, req, Npoints, _r, I); + memory I; + InterpolationMatrix1D(_N, req, _r, I); // Compute warp factor for (int n=0;n x, + const memory y, + memory r, + memory s) { + const int Npoints = x.length(); + for (int n=0;n _r, + memory _s, + const dfloat alphaIn){ const dfloat alpopt[15] = {0.0000, 0.0000, 1.4152, 0.1001, 0.2751, 0.9800, 1.0999, 1.2832, 1.3648, 1.4773, 1.4959, 1.5743, 1.5770, 1.6223, 1.6258}; + const int _Npoints = _r.length(); + dfloat alpha; if (alphaIn==-1) { if (_N<16) { @@ -740,16 +936,16 @@ void mesh_t::WarpBlendTransformTri2D(int _N, int _Npoints, dfloat *_r, dfloat *_ } // Convert r s coordinates to points in equilateral triangle - dfloat *L1 = (dfloat*) malloc(_Npoints*sizeof(dfloat)); - dfloat *L2 = (dfloat*) malloc(_Npoints*sizeof(dfloat)); - dfloat *L3 = (dfloat*) malloc(_Npoints*sizeof(dfloat)); + memory L1(_Npoints); + memory L2(_Npoints); + memory L3(_Npoints); - dfloat *dL32 = (dfloat*) malloc(_Npoints*sizeof(dfloat)); - dfloat *dL13 = (dfloat*) malloc(_Npoints*sizeof(dfloat)); - dfloat *dL21 = (dfloat*) malloc(_Npoints*sizeof(dfloat)); + memory dL32(_Npoints); + memory dL13(_Npoints); + memory dL21(_Npoints); - dfloat *_x = (dfloat*) malloc(_Npoints*sizeof(dfloat)); - dfloat *_y = (dfloat*) malloc(_Npoints*sizeof(dfloat)); + memory 
_x(_Npoints); + memory _y(_Npoints); for (int n=0;n<_Npoints;n++) { L1[n] = 0.5*(1.+_s[n]); @@ -763,13 +959,13 @@ void mesh_t::WarpBlendTransformTri2D(int _N, int _Npoints, dfloat *_r, dfloat *_ _x[n] = -L2[n]+L3[n]; _y[n] = (-L2[n]-L3[n]+2.*L1[n])/sqrt(3.0); } - dfloat *warpf1 = (dfloat*) malloc(_Npoints*sizeof(dfloat)); - dfloat *warpf2 = (dfloat*) malloc(_Npoints*sizeof(dfloat)); - dfloat *warpf3 = (dfloat*) malloc(_Npoints*sizeof(dfloat)); + memory warpf1(_Npoints); + memory warpf2(_Npoints); + memory warpf3(_Npoints); - Warpfactor(_N, _Npoints, dL32, warpf1); - Warpfactor(_N, _Npoints, dL13, warpf2); - Warpfactor(_N, _Npoints, dL21, warpf3); + Warpfactor(_N, dL32, warpf1); + Warpfactor(_N, dL13, warpf2); + Warpfactor(_N, dL21, warpf3); for (int n=0;n<_Npoints;n++) { dfloat blend1 = 4.0*L2[n]*L3[n]; @@ -784,12 +980,7 @@ void mesh_t::WarpBlendTransformTri2D(int _N, int _Npoints, dfloat *_r, dfloat *_ _y[n] += 0.*warp1 + sin(2.*M_PI/3.)*warp2 + sin(4.*M_PI/3.)*warp3; } - xytors(_Npoints, _x, _y, _r, _s); - - free(L1); free(L2); free(L3); - free(dL32); free(dL21); free(dL13); - free(warpf1); free(warpf2); free(warpf3); - free(_x); free(_y); + xytors(_x, _y, _r, _s); } // ------------------------------------------------------------------------ @@ -998,18 +1189,20 @@ static const dfloat cubTriR50[453] = {-4.872882732304178e-01,-2.542345353916386e static const dfloat cubTriS50[453] = {-4.872882732304183e-01,-4.872882732304183e-01,-2.542345353916348e-02,-9.981550877878594e-01,-1.133538262833160e-01, 1.115089140711756e-01,-9.981550877878594e-01, 1.115089140711755e-01,-1.133538262833150e-01,-9.551240174753737e-01,-9.481262122484194e-01, 9.032502297237955e-01,-9.551240174753737e-01, 9.032502297237955e-01,-9.481262122484182e-01,-5.544185434125337e-01,-5.197117894734606e-01, 7.413033288599558e-02,-5.544185434125337e-01, 7.413033288599546e-02,-5.197117894734606e-01,-9.537595720595068e-01,-7.308947639467150e-01, 6.846543360062232e-01,-9.537595720595068e-01, 
6.846543360062232e-01,-7.308947639467139e-01,-9.982864404099823e-01,-9.099288550653375e-01, 9.082152954753198e-01,-9.982864404099823e-01, 9.082152954753198e-01,-9.099288550653363e-01,-9.903827246782556e-01,-1.825708668644565e-01, 1.729535915427133e-01,-9.903827246782556e-01, 1.729535915427121e-01,-1.825708668644553e-01,-9.530139752811234e-01,-6.570730845200339e-01, 6.100870598011572e-01,-9.530139752811234e-01, 6.100870598011572e-01,-6.570730845200325e-01,-9.967388437157084e-01,-9.967388437157074e-01, 9.934776874314213e-01,-8.132067057412727e-01,-5.797881505899334e-01, 3.929948563312072e-01,-8.132067057412727e-01, 3.929948563312072e-01,-5.797881505899334e-01,-9.164067266977042e-01,-7.214063597628300e-01, 6.378130864605343e-01,-9.164067266977042e-01, 6.378130864605343e-01,-7.214063597628289e-01,-9.012853762142768e-01,-8.592358921744816e-01, 7.605212683887597e-01,-9.012853762142771e-01, 7.605212683887597e-01,-8.592358921744805e-01,-9.613074769938050e-01,-1.934626150309704e-02,-1.934626150309704e-02,-6.943717859756775e-01,-2.547870236870117e-01,-5.084119033730944e-02,-6.943717859756775e-01,-5.084119033730944e-02,-2.547870236870114e-01,-8.697691401387569e-01,-7.041685654447123e-01, 5.739377055834691e-01,-8.697691401387569e-01, 5.739377055834691e-01,-7.041685654447112e-01,-9.530757136479578e-01,-5.719507128367128e-01, 5.250264264846709e-01,-9.530757136479578e-01, 5.250264264846709e-01,-5.719507128367118e-01,-8.959354435311784e-01,-3.394651938837119e-01, 2.354006374148909e-01,-8.959354435311784e-01, 2.354006374148909e-01,-3.394651938837116e-01,-9.970109617889348e-01,-2.600302729111919e-01, 2.570412347001281e-01,-9.970109617889348e-01, 2.570412347001280e-01,-2.600302729111914e-01,-9.759125061304149e-01,-8.649768619133283e-01, 8.408893680437466e-01,-9.759125061304149e-01, 8.408893680437466e-01,-8.649768619133272e-01,-8.866304289130773e-01,-1.258023176398068e-01, 1.243274655288423e-02,-8.866304289130773e-01, 
1.243274655288418e-02,-1.258023176398056e-01,-6.903486740300824e-01,-1.548256629849583e-01,-1.548256629849583e-01,-7.702797159103688e-01,-1.709755576844286e-01,-5.874472640520140e-02,-7.702797159103689e-01,-5.874472640520140e-02,-1.709755576844287e-01,-9.374703276265821e-01,-2.752389754621450e-01, 2.127093030887279e-01,-9.374703276265821e-01, 2.127093030887279e-01,-2.752389754621447e-01,-7.955067197089168e-01,-3.754974289775928e-01, 1.710041486865101e-01,-7.955067197089168e-01, 1.710041486865101e-01,-3.754974289775926e-01,-9.223027057361204e-01,-3.884864713193931e-02,-3.884864713193931e-02,-9.033013962538536e-01,-2.146213565926236e-01, 1.179227528464785e-01,-9.033013962538536e-01, 1.179227528464785e-01,-2.146213565926236e-01,-9.800408461407152e-01,-9.478722916754247e-01, 9.279131378161389e-01,-9.800408461407152e-01, 9.279131378161389e-01,-9.478722916754235e-01,-8.321741184601432e-01,-2.032180083100886e-01, 3.539212677023307e-02,-8.321741184601432e-01, 3.539212677023307e-02,-2.032180083100886e-01,-9.865437365974319e-01,-9.082392173581701e-01, 8.947829539556066e-01,-9.865437365974319e-01, 8.947829539556066e-01,-9.082392173581691e-01,-6.453839503352626e-01,-4.978033842696664e-01, 1.431873346049300e-01,-6.453839503352626e-01, 1.431873346049300e-01,-4.978033842696664e-01,-9.146318245298287e-01,-6.337261321307773e-01, 5.483579566606072e-01,-9.146318245298287e-01, 5.483579566606072e-01,-6.337261321307773e-01,-8.512144639199243e-01,-2.919073934432752e-01, 1.431218573632000e-01,-8.512144639199243e-01, 1.431218573632000e-01,-2.919073934432750e-01,-5.128621044059271e-01,-4.015780257796654e-01,-8.555986981440711e-02,-5.128621044059271e-01,-8.555986981440711e-02,-4.015780257796653e-01,-8.472168605080239e-01,-4.183215340291764e-01, 2.655383945372011e-01,-8.472168605080239e-01, 2.655383945372011e-01,-4.183215340291763e-01,-7.515508206622273e-01,-6.277077362933863e-01, 3.792585569556139e-01,-7.515508206622273e-01, 
3.792585569556139e-01,-6.277077362933852e-01,-9.804419832588688e-01,-7.327456500575513e-01, 7.131876333164204e-01,-9.804419832588688e-01, 7.131876333164204e-01,-7.327456500575501e-01,-9.441135959036719e-01,-8.620473297035763e-01, 8.061609256072495e-01,-9.441135959036719e-01, 8.061609256072495e-01,-8.620473297035763e-01,-9.503834629782960e-01,-4.770179614603841e-01, 4.274014244386812e-01,-9.503834629782960e-01, 4.274014244386800e-01,-4.770179614603829e-01,-9.577479991442321e-01,-9.120410878817669e-01, 8.697890870259968e-01,-9.577479991442321e-01, 8.697890870259968e-01,-9.120410878817656e-01,-8.637002594919088e-01,-5.060855365853788e-01, 3.697857960772886e-01,-8.637002594919088e-01, 3.697857960772886e-01,-5.060855365853777e-01,-8.624093861991303e-01,-6.167853940403172e-01, 4.791947802394489e-01,-8.624093861991303e-01, 4.791947802394489e-01,-6.167853940403172e-01,-6.872930457599575e-01,-3.652080353829373e-01, 5.250108114289476e-02,-6.872930457599575e-01, 5.250108114289476e-02,-3.652080353829372e-01,-9.684224577286503e-01,-2.123213874003148e-01, 1.807438451289652e-01,-9.684224577286503e-01, 1.807438451289652e-01,-2.123213874003138e-01,-9.531071840655393e-01,-3.637106635744283e-01, 3.168178476399691e-01,-9.531071840655393e-01, 3.168178476399681e-01,-3.637106635744282e-01,-7.281403730572283e-01,-4.392710248840062e-01, 1.674113979412348e-01,-7.281403730572283e-01, 1.674113979412347e-01,-4.392710248840059e-01,-9.106088059739158e-01,-5.400485398316867e-01, 4.506573458056035e-01,-9.106088059739158e-01, 4.506573458056035e-01,-5.400485398316867e-01,-5.078874986216810e-01,-3.010237499257296e-01,-1.910887514525894e-01,-5.078874986216810e-01,-1.910887514525894e-01,-3.010237499257296e-01,-9.784730545245666e-01,-8.071832841875552e-01, 7.856563387121220e-01,-9.784730545245666e-01, 7.856563387121220e-01,-8.071832841875541e-01,-9.464044605480616e-01,-7.993239840668014e-01, 7.457284446148619e-01,-9.464044605480616e-01, 
7.457284446148619e-01,-7.993239840668002e-01,-6.001966867209210e-01,-4.211243138320420e-01, 2.132100055296415e-02,-6.001966867209210e-01, 2.132100055296426e-02,-4.211243138320419e-01,-8.383470872976471e-01,-8.082645635117647e-02,-8.082645635117519e-02,-9.806804143047906e-01,-6.427670920898165e-01, 6.234475063946073e-01,-9.806804143047906e-01, 6.234475063946073e-01,-6.427670920898154e-01,-7.733335317298518e-01,-7.733335317298506e-01, 5.466670634597035e-01,-9.127674219487553e-01,-4.276921547374961e-01, 3.404595766862517e-01,-9.127674219487553e-01, 3.404595766862516e-01,-4.276921547374953e-01,-9.807324089004609e-01,-5.413422080782049e-01, 5.220746169786655e-01,-9.807324089004609e-01, 5.220746169786655e-01,-5.413422080782035e-01,-7.670859956964946e-01,-2.915742076670567e-01, 5.866020336355282e-02,-7.670859956964946e-01, 5.866020336355166e-02,-2.915742076670563e-01,-9.810462457017906e-01,-3.101110169100291e-01, 2.911572626118211e-01,-9.810462457017906e-01, 2.911572626118211e-01,-3.101110169100285e-01,-9.440667674080058e-01,-1.366535383528923e-01, 8.072030576089800e-02,-9.440667674080058e-01, 8.072030576089800e-02,-1.366535383528912e-01,-7.054683116064060e-01,-5.584055524963210e-01, 2.638738641027268e-01,-7.054683116064060e-01, 2.638738641027268e-01,-5.584055524963196e-01,-9.781219738706742e-01,-9.781219738706731e-01, 9.562439477413485e-01,-9.962588661779311e-01,-6.936543298046310e-01, 6.899131959825635e-01,-9.962588661779311e-01, 6.899131959825635e-01,-6.936543298046310e-01,-8.427072760972010e-01,-8.427072760971998e-01, 6.854145521944008e-01,-9.958657196429472e-01,-9.816027664683789e-01, 9.774684861113283e-01,-9.958657196429472e-01, 9.774684861113283e-01,-9.816027664683776e-01,-9.959872383159527e-01,-9.529095314851794e-01, 9.488967698011345e-01,-9.959872383159527e-01, 
9.488967698011345e-01,-9.529095314851782e-01,-6.068160154409350e-01,-3.085430405630674e-01,-8.464094399599648e-02,-6.068160154409350e-01,-8.464094399599648e-02,-3.085430405630671e-01,-7.850422677955388e-01,-4.996613823756856e-01, 2.847036501712243e-01,-7.850422677955388e-01, 2.847036501712243e-01,-4.996613823756844e-01,-9.962763812934690e-01,-4.889002747967343e-01, 4.851766560902033e-01,-9.962763812934691e-01, 4.851766560902033e-01,-4.889002747967332e-01,-8.995854148168808e-01,-7.934131294747505e-01, 6.929985442916313e-01,-8.995854148168808e-01, 6.929985442916313e-01,-7.934131294747493e-01,-6.594822352717378e-01,-6.594822352717367e-01, 3.189644705434744e-01,-8.418024664862509e-01,-7.730344416189197e-01, 6.148369081051707e-01,-8.418024664862509e-01, 6.148369081051707e-01,-7.730344416189197e-01,-9.963422606399693e-01,-5.959969617208548e-01, 5.923392223608240e-01,-9.963422606399693e-01, 5.923392223608240e-01,-5.959969617208537e-01,-6.007330813263658e-01,-6.007330813263658e-01, 2.014661626527315e-01,-8.050030786501207e-01,-6.891854133166682e-01, 4.941884919667888e-01,-8.050030786501207e-01, 4.941884919667888e-01,-6.891854133166671e-01,-9.794579145241266e-01,-4.325337483001823e-01, 4.119916628243097e-01,-9.794579145241266e-01, 4.119916628243097e-01,-4.325337483001819e-01,-6.026757301953206e-01,-1.986621349023401e-01,-1.986621349023392e-01,-7.187722518872315e-01,-7.187722518872315e-01, 4.375445037744630e-01,-9.961098664509530e-01,-7.797247785760779e-01, 7.758346450270308e-01,-9.961098664509530e-01, 7.758346450270308e-01,-7.797247785760768e-01,-4.047614577816621e-01,-2.976192711091686e-01,-2.976192711091686e-01,-9.154992749697238e-01,-9.154992749697237e-01, 8.309985499394521e-01,-9.959745701414753e-01,-3.767460272074153e-01, 3.727205973488916e-01,-9.959745701414753e-01, 3.727205973488916e-01,-3.767460272074148e-01,-9.947355932482315e-01,-2.632203375884212e-03,-2.632203375883158e-03,-9.802403515707592e-01,-8.786874490008606e-02, 
6.810909647084645e-02,-9.802403515707592e-01, 6.810909647084645e-02,-8.786874490008606e-02,-4.070402649740579e-01,-4.070402649740577e-01,-1.859194700518831e-01,-9.955476949059794e-01,-8.518818506402979e-01, 8.474295455462760e-01,-9.955476949059794e-01, 8.474295455462760e-01,-8.518818506402979e-01}; static const dfloat cubTriW50[453] = { 6.657390349426455e-03, 6.657390349426455e-03, 6.657390349426455e-03, 6.946563189849545e-04, 6.946563189849545e-04, 6.946563189849545e-04, 6.946563189849545e-04, 6.946563189849545e-04, 6.946563189849545e-04, 6.001002038198160e-04, 6.001002038198160e-04, 6.001002038198160e-04, 6.001002038198160e-04, 6.001002038198160e-04, 6.001002038198160e-04, 5.460804118459578e-03, 5.460804118459578e-03, 5.460804118459578e-03, 5.460804118459578e-03, 5.460804118459578e-03, 5.460804118459578e-03, 2.265942903569247e-03, 2.265942903569247e-03, 2.265942903569247e-03, 2.265942903569247e-03, 2.265942903569247e-03, 2.265942903569247e-03, 2.730436819934103e-04, 2.730436819934103e-04, 2.730436819934103e-04, 2.730436819934103e-04, 2.730436819934103e-04, 2.730436819934103e-04, 1.538752133568642e-03, 1.538752133568642e-03, 1.538752133568642e-03, 1.538752133568642e-03, 1.538752133568642e-03, 1.538752133568642e-03, 2.593946389224276e-03, 2.593946389224276e-03, 2.593946389224276e-03, 2.593946389224276e-03, 2.593946389224276e-03, 2.593946389224276e-03, 7.318644849273957e-05, 7.318644849273957e-05, 7.318644849273957e-05, 5.611710324220868e-03, 5.611710324220868e-03, 5.611710324220868e-03, 5.611710324220868e-03, 5.611710324220868e-03, 5.611710324220868e-03, 3.417648445986814e-03, 3.417648445986814e-03, 3.417648445986814e-03, 3.417648445986814e-03, 3.417648445986814e-03, 3.417648445986814e-03, 2.967053886437357e-03, 2.967053886437357e-03, 2.967053886437357e-03, 2.967053886437357e-03, 2.967053886437357e-03, 2.967053886437357e-03, 3.302792714558273e-03, 3.302792714558273e-03, 3.302792714558273e-03, 8.508388866481208e-03, 8.508388866481208e-03, 8.508388866481208e-03, 
8.508388866481208e-03, 8.508388866481208e-03, 8.508388866481208e-03, 4.395599173436755e-03, 4.395599173436755e-03, 4.395599173436755e-03, 4.395599173436755e-03, 4.395599173436755e-03, 4.395599173436755e-03, 3.097163895783280e-03, 3.097163895783280e-03, 3.097163895783280e-03, 3.097163895783280e-03, 3.097163895783280e-03, 3.097163895783280e-03, 5.206561060307146e-03, 5.206561060307146e-03, 5.206561060307146e-03, 5.206561060307146e-03, 5.206561060307146e-03, 5.206561060307146e-03, 9.323405282065094e-04, 9.323405282065094e-04, 9.323405282065094e-04, 9.323405282065094e-04, 9.323405282065094e-04, 9.323405282065094e-04, 1.315154312747027e-03, 1.315154312747027e-03, 1.315154312747027e-03, 1.315154312747027e-03, 1.315154312747027e-03, 1.315154312747027e-03, 5.572930818998745e-03, 5.572930818998745e-03, 5.572930818998745e-03, 5.572930818998745e-03, 5.572930818998745e-03, 5.572930818998745e-03, 8.518193356382943e-03, 8.518193356382943e-03, 8.518193356382943e-03, 8.259313906283377e-03, 8.259313906283377e-03, 8.259313906283377e-03, 8.259313906283377e-03, 8.259313906283377e-03, 8.259313906283377e-03, 4.308565834654927e-03, 4.308565834654927e-03, 4.308565834654927e-03, 4.308565834654927e-03, 4.308565834654927e-03, 4.308565834654927e-03, 6.951904403342183e-03, 6.951904403342183e-03, 6.951904403342183e-03, 6.951904403342183e-03, 6.951904403342183e-03, 6.951904403342183e-03, 4.848559914819788e-03, 4.848559914819788e-03, 4.848559914819788e-03, 5.413228508794187e-03, 5.413228508794187e-03, 5.413228508794187e-03, 5.413228508794187e-03, 5.413228508794187e-03, 5.413228508794187e-03, 7.715585739642544e-04, 7.715585739642544e-04, 7.715585739642544e-04, 7.715585739642544e-04, 7.715585739642544e-04, 7.715585739642544e-04, 6.879332858451239e-03, 6.879332858451239e-03, 6.879332858451239e-03, 6.879332858451239e-03, 6.879332858451239e-03, 6.879332858451239e-03, 8.760538741807422e-04, 8.760538741807422e-04, 8.760538741807422e-04, 8.760538741807422e-04, 8.760538741807422e-04, 
8.760538741807422e-04, 8.812029055620537e-03, 8.812029055620537e-03, 8.812029055620537e-03, 8.812029055620537e-03, 8.812029055620537e-03, 8.812029055620537e-03, 4.035001616910756e-03, 4.035001616910756e-03, 4.035001616910756e-03, 4.035001616910756e-03, 4.035001616910756e-03, 4.035001616910756e-03, 6.476557209099314e-03, 6.476557209099314e-03, 6.476557209099314e-03, 6.476557209099314e-03, 6.476557209099314e-03, 6.476557209099314e-03, 9.332421598387951e-03, 9.332421598387951e-03, 9.332421598387951e-03, 9.332421598387951e-03, 9.332421598387951e-03, 9.332421598387951e-03, 6.318451635509753e-03, 6.318451635509753e-03, 6.318451635509753e-03, 6.318451635509753e-03, 6.318451635509753e-03, 6.318451635509753e-03, 6.947169036892493e-03, 6.947169036892493e-03, 6.947169036892493e-03, 6.947169036892493e-03, 6.947169036892493e-03, 6.947169036892493e-03, 1.790785760819592e-03, 1.790785760819592e-03, 1.790785760819592e-03, 1.790785760819592e-03, 1.790785760819592e-03, 1.790785760819592e-03, 2.160956967250754e-03, 2.160956967250754e-03, 2.160956967250754e-03, 2.160956967250754e-03, 2.160956967250754e-03, 2.160956967250754e-03, 3.625948898219889e-03, 3.625948898219889e-03, 3.625948898219889e-03, 3.625948898219889e-03, 3.625948898219889e-03, 3.625948898219889e-03, 1.530314260972668e-03, 1.530314260972668e-03, 1.530314260972668e-03, 1.530314260972668e-03, 1.530314260972668e-03, 1.530314260972668e-03, 5.715363408558084e-03, 5.715363408558084e-03, 5.715363408558084e-03, 5.715363408558084e-03, 5.715363408558084e-03, 5.715363408558084e-03, 5.311624181103388e-03, 5.311624181103388e-03, 5.311624181103388e-03, 5.311624181103388e-03, 5.311624181103388e-03, 5.311624181103388e-03, 8.954875242212912e-03, 8.954875242212912e-03, 8.954875242212912e-03, 8.954875242212912e-03, 8.954875242212912e-03, 8.954875242212912e-03, 3.307034522546869e-03, 3.307034522546869e-03, 3.307034522546869e-03, 3.307034522546869e-03, 3.307034522546869e-03, 3.307034522546869e-03, 3.889979271429086e-03, 
3.889979271429086e-03, 3.889979271429086e-03, 3.889979271429086e-03, 3.889979271429086e-03, 3.889979271429086e-03, 8.109035161435074e-03, 8.109035161435074e-03, 8.109035161435074e-03, 8.109035161435074e-03, 8.109035161435074e-03, 8.109035161435074e-03, 4.694394019735870e-03, 4.694394019735870e-03, 4.694394019735870e-03, 4.694394019735870e-03, 4.694394019735870e-03, 4.694394019735870e-03, 1.083959336378307e-02, 1.083959336378307e-02, 1.083959336378307e-02, 1.083959336378307e-02, 1.083959336378307e-02, 1.083959336378307e-02, 1.642838231112343e-03, 1.642838231112343e-03, 1.642838231112343e-03, 1.642838231112343e-03, 1.642838231112343e-03, 1.642838231112343e-03, 2.666246149563934e-03, 2.666246149563934e-03, 2.666246149563934e-03, 2.666246149563934e-03, 2.666246149563934e-03, 2.666246149563934e-03, 9.345204607681365e-03, 9.345204607681365e-03, 9.345204607681365e-03, 9.345204607681365e-03, 9.345204607681365e-03, 9.345204607681365e-03, 7.165422422124212e-03, 7.165422422124212e-03, 7.165422422124212e-03, 2.094698488702054e-03, 2.094698488702054e-03, 2.094698488702054e-03, 2.094698488702054e-03, 2.094698488702054e-03, 2.094698488702054e-03, 5.559197600889325e-03, 5.559197600889325e-03, 5.559197600889325e-03, 5.114025641788271e-03, 5.114025641788271e-03, 5.114025641788271e-03, 5.114025641788271e-03, 5.114025641788271e-03, 5.114025641788271e-03, 2.307531856262896e-03, 2.307531856262896e-03, 2.307531856262896e-03, 2.307531856262896e-03, 2.307531856262896e-03, 2.307531856262896e-03, 8.447515166178116e-03, 8.447515166178116e-03, 8.447515166178116e-03, 8.447515166178116e-03, 8.447515166178116e-03, 8.447515166178116e-03, 2.674835029006195e-03, 2.674835029006195e-03, 2.674835029006195e-03, 2.674835029006195e-03, 2.674835029006195e-03, 2.674835029006195e-03, 4.762846609148414e-03, 4.762846609148414e-03, 4.762846609148414e-03, 4.762846609148414e-03, 4.762846609148414e-03, 4.762846609148414e-03, 8.565704904038088e-03, 8.565704904038088e-03, 8.565704904038088e-03, 
8.565704904038088e-03, 8.565704904038088e-03, 8.565704904038088e-03, 5.999043738296138e-04, 5.999043738296138e-04, 5.999043738296138e-04, 8.813879170752840e-04, 8.813879170752840e-04, 8.813879170752840e-04, 8.813879170752840e-04, 8.813879170752840e-04, 8.813879170752840e-04, 4.135743334159545e-03, 4.135743334159545e-03, 4.135743334159545e-03, 2.312905155015999e-04, 2.312905155015999e-04, 2.312905155015999e-04, 2.312905155015999e-04, 2.312905155015999e-04, 2.312905155015999e-04, 3.648502036079644e-04, 3.648502036079644e-04, 3.648502036079644e-04, 3.648502036079644e-04, 3.648502036079644e-04, 3.648502036079644e-04, 1.027023169453754e-02, 1.027023169453754e-02, 1.027023169453754e-02, 1.027023169453754e-02, 1.027023169453754e-02, 1.027023169453754e-02, 7.810111420309305e-03, 7.810111420309305e-03, 7.810111420309305e-03, 7.810111420309305e-03, 7.810111420309305e-03, 7.810111420309305e-03, 1.056435119667814e-03, 1.056435119667814e-03, 1.056435119667814e-03, 1.056435119667814e-03, 1.056435119667814e-03, 1.056435119667814e-03, 3.798126951694121e-03, 3.798126951694121e-03, 3.798126951694121e-03, 3.798126951694121e-03, 3.798126951694121e-03, 3.798126951694121e-03, 8.055421977036622e-03, 8.055421977036622e-03, 8.055421977036622e-03, 4.668118624327395e-03, 4.668118624327395e-03, 4.668118624327395e-03, 4.668118624327395e-03, 4.668118624327395e-03, 4.668118624327395e-03, 9.653413559593795e-04, 9.653413559593795e-04, 9.653413559593795e-04, 9.653413559593795e-04, 9.653413559593795e-04, 9.653413559593795e-04, 9.141347778252359e-03, 9.141347778252359e-03, 9.141347778252359e-03, 6.498038898642801e-03, 6.498038898642801e-03, 6.498038898642801e-03, 6.498038898642801e-03, 6.498038898642801e-03, 6.498038898642801e-03, 2.639533833535306e-03, 2.639533833535306e-03, 2.639533833535306e-03, 2.639533833535306e-03, 2.639533833535306e-03, 2.639533833535306e-03, 1.033071185490775e-02, 1.033071185490775e-02, 1.033071185490775e-02, 6.970526706905318e-03, 6.970526706905318e-03, 
6.970526706905318e-03, 7.934585220888267e-04, 7.934585220888267e-04, 7.934585220888267e-04, 7.934585220888267e-04, 7.934585220888267e-04, 7.934585220888267e-04, 1.171362890783687e-02, 1.171362890783687e-02, 1.171362890783687e-02, 2.397897567676900e-03, 2.397897567676900e-03, 2.397897567676900e-03, 1.171186882042265e-03, 1.171186882042265e-03, 1.171186882042265e-03, 1.171186882042265e-03, 1.171186882042265e-03, 1.171186882042265e-03, 1.498644108210421e-03, 1.498644108210421e-03, 1.498644108210421e-03, 2.801549769502369e-03, 2.801549769502369e-03, 2.801549769502369e-03, 2.801549769502369e-03, 2.801549769502369e-03, 2.801549769502369e-03, 1.154727780718457e-02, 1.154727780718457e-02, 1.154727780718457e-02, 7.410110976283445e-04, 7.410110976283445e-04, 7.410110976283445e-04, 7.410110976283445e-04, 7.410110976283445e-04, 7.410110976283445e-04}; -void mesh_t::CubatureNodesTri2D(int cubTriN, int *_cubNp, dfloat **cubTrir, dfloat **cubTris, dfloat **cubTriw){ +void mesh_t::CubatureNodesTri2D(const int cubTriN, + int& _cubNp, + memory& cubTrir, + memory& cubTris, + memory& cubTriw){ - if (cubTriN>50) - LIBP_ABORT(string("Requested Cubature order unavailable.")) + LIBP_ABORT("Requested Cubature order unavailable.", + cubTriN>50); - int cubTriNp = cubTriNps[cubTriN-1]; + _cubNp = cubTriNps[cubTriN-1]; - *_cubNp = cubTriNp; - - *cubTrir = (dfloat*) calloc(cubTriNp, sizeof(dfloat)); - *cubTris = (dfloat*) calloc(cubTriNp, sizeof(dfloat)); - *cubTriw = (dfloat*) calloc(cubTriNp, sizeof(dfloat)); + cubTrir.malloc(_cubNp); + cubTris.malloc(_cubNp); + cubTriw.malloc(_cubNp); const dfloat *cubTriR, *cubTriS, *cubTriW; switch(cubTriN){ @@ -1064,12 +1257,14 @@ void mesh_t::CubatureNodesTri2D(int cubTriN, int *_cubNp, dfloat **cubTrir, dflo case 49: cubTriR = cubTriR49; cubTriS = cubTriS49; cubTriW = cubTriW49; break; case 50: cubTriR = cubTriR50; cubTriS = cubTriS50; cubTriW = cubTriW50; break; default: - LIBP_ABORT(string("Requested Cubature order unavailable.")) + 
LIBP_FORCE_ABORT("Requested Cubature order unavailable."); } - for(int n=0;n +using __gnu_parallel::sort; +#else +using std::sort; +#endif + +namespace libp { + // structure used to encode vertices that make // each face, the element/face indices, and // the neighbor element/face indices (if any) -typedef struct{ - - dlong element; - int face; - - dlong elementNeighbor; // neighbor element - int faceNeighbor; // neighbor face - - hlong v[4]; +typedef struct { + hlong v[4]; // vertices on face + dlong element, elementN; + int face, faceN; // face info + int rank, rankN; // N for neighbor face info }face_t; -/* routine to find EToE (Element To Element) - and EToF (Element To Local Face) connectivity arrays */ +// mesh is the local partition void mesh_t::Connect(){ + EToE.malloc(Nelements*Nfaces); + EToF.malloc(Nelements*Nfaces); + EToP.malloc(Nelements*Nfaces); + + /********************** + * Local Connectivity + **********************/ + /* build list of faces */ - face_t *faces = - (face_t*) calloc(Nelements*Nfaces, sizeof(face_t)); + memory faces(Nelements*Nfaces); - dlong cnt = 0; + #pragma omp parallel for collapse(2) for(dlong e=0;e()); - faces[cnt].element = e; - faces[cnt].face = f; - - faces[cnt].elementNeighbor= -1; - faces[cnt].faceNeighbor = -1; + faces[id].element = e; + faces[id].face = f; - ++cnt; + faces[id].elementN= -1; + faces[id].faceN = -1; } } /* sort faces by their vertex number pairs */ - std::sort(faces, faces+Nelements*Nfaces, - [&](const face_t& a, const face_t& b) { - return std::lexicographical_compare(a.v, a.v+NfaceVertices, - b.v, b.v+NfaceVertices); - }); + sort(faces.ptr(), faces.ptr()+Nelements*Nfaces, + [&](const face_t& a, const face_t& b) { + return std::lexicographical_compare(a.v, a.v+NfaceVertices, + b.v, b.v+NfaceVertices); + }); /* scan through sorted face lists looking for adjacent faces that have the same vertex ids */ - for(cnt=0;cnt b.element) return false; + sort(faces.ptr(), faces.ptr()+Nelements*Nfaces, + [](const 
face_t& a, const face_t& b) { + if(a.element < b.element) return true; + if(a.element > b.element) return false; - return (a.face < b.face); - }); + return (a.face < b.face); + }); /* extract the element to element and element to face connectivity */ - EToE = (dlong*) calloc(Nelements*Nfaces, sizeof(dlong)); - EToF = (int*) calloc(Nelements*Nfaces, sizeof(int )); + #pragma omp parallel for collapse(2) + for(dlong e=0;e Nsend(size, 0); + memory Nrecv(size, 0); + memory sendOffsets(size, 0); + memory recvOffsets(size, 0); + + // WARNING: In some corner cases, the number of faces to send may overrun int storage + int allNsend = 0; for(dlong e=0;e sendFaces(allNsend); + + // pack face data + for(dlong e=0;e()); + + sendFaces[id].rank = rank; + + sendFaces[id].elementN = -1; + sendFaces[id].faceN = -1; + sendFaces[id].rankN = -1; + + ++Nsend[destRank]; + } + } + } + + // exchange byte counts + comm.Alltoall(Nsend, Nrecv); + + // count incoming faces + int allNrecv = 0; + for(int rr=0;rr recvFaces(allNrecv); + + // exchange parallel faces + comm.Alltoallv(sendFaces, Nsend, sendOffsets, + recvFaces, Nrecv, recvOffsets); + + // local sort allNrecv received faces + sort(recvFaces.ptr(), recvFaces.ptr()+allNrecv, + [&](const face_t& a, const face_t& b) { + return std::lexicographical_compare(a.v, a.v+NfaceVertices, + b.v, b.v+NfaceVertices); + }); + + // find matches + #pragma omp parallel for + for(int n=0;n b.rank) return false; + + if(a.element < b.element) return true; + if(a.element > b.element) return false; + + return (a.face < b.face); + }); + + // send faces back from whence they came + comm.Alltoallv(recvFaces, Nrecv, recvOffsets, + sendFaces, Nsend, sendOffsets); + + // extract connectivity info + #pragma omp parallel for + for(dlong n=0;n=0 && f>=0 && eN>=0 && fN>=0){ + EToE[e*Nfaces+f] = eN; + EToF[e*Nfaces+f] = fN; + EToP[e*Nfaces+f] = rN; + } + } + + //record the number of elements in the whole mesh + NelementsGlobal = Nelements; + 
comm.Allreduce(NelementsGlobal); } + +} //namespace libp diff --git a/libs/mesh/meshConnectBoundary.cpp b/libs/mesh/meshConnectBoundary.cpp index 97207dad7..29bd5c231 100644 --- a/libs/mesh/meshConnectBoundary.cpp +++ b/libs/mesh/meshConnectBoundary.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,6 +26,15 @@ SOFTWARE. #include "mesh.hpp" +#ifdef GLIBCXX_PARALLEL +#include +using __gnu_parallel::sort; +#else +using std::sort; +#endif + +namespace libp { + // structure used to encode vertices that make // each face, the element/face indices, and // the neighbor element/face indices (if any) @@ -59,8 +68,7 @@ void mesh_t::ConnectBoundary(){ #endif /* build list of boundary faces */ - boundaryFace_t *boundaryFaces = (boundaryFace_t*) calloc(bcnt+NboundaryFaces, - sizeof(boundaryFace_t)); + memory boundaryFaces(bcnt+NboundaryFaces); bcnt = 0; // reset counter for(dlong e=0;e(EToB); + #if 0 int cnt = 0; for(int e=0;e R; + + switch (elementType) { + case Mesh::TRIANGLES: + FaceNodeMatchingTri2D(r, s, faceNodes, faceVertices, R); + break; + case Mesh::QUADRILATERALS: + FaceNodeMatchingQuad2D(r, s, faceNodes, faceVertices, R); + break; + case Mesh::TETRAHEDRA: + FaceNodeMatchingTet3D(r, s, t, faceNodes, faceVertices, R); + break; + case Mesh::HEXAHEDRA: + FaceNodeMatchingHex3D(r, s, t, faceNodes, faceVertices, R); + break; + } + + /* volume indices of the interior and exterior face nodes for each element */ + vmapM.malloc(Nfp*Nfaces*Nelements); + vmapP.malloc(Nfp*Nfaces*Nelements); + mapP.malloc(Nfp*Nfaces*Nelements); + + /* assume elements already connected */ + #pragma omp parallel for collapse(2) + for(dlong eM=0;eM(vmapM); + o_vmapP = platform.malloc(vmapP); + o_mapP 
= platform.malloc(mapP); +} + +} //namespace libp diff --git a/libs/mesh/meshConnectFaceNodes2D.cpp b/libs/mesh/meshConnectFaceNodes2D.cpp deleted file mode 100644 index 97f84b9f5..000000000 --- a/libs/mesh/meshConnectFaceNodes2D.cpp +++ /dev/null @@ -1,138 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- -*/ - -#include "mesh.hpp" -#include "mesh/mesh2D.hpp" - -static int findBestMatch(dfloat x1, dfloat y1, - int Np2, int *nodeList, dfloat *x2, dfloat *y2, int *nP){ - - int matchIndex = nodeList[0]; - dfloat mindist2 = pow(x1-x2[nodeList[0]],2) + pow(y1-y2[nodeList[0]],2); - - *nP = 0; - for(int n=1;n1e-3) { - stringstream ss; - ss << "Bad match: x,y = " << x1 << ", " << y1 << "\n"; - LIBP_ABORT(ss.str()) - } - return matchIndex; -} - -// serial face-node to face-node connection -void mesh2D::ConnectFaceNodes(){ - - /* volume indices of the interior and exterior face nodes for each element */ - vmapM = (dlong*) calloc(Nfp*Nfaces*Nelements, sizeof(dlong)); - vmapP = (dlong*) calloc(Nfp*Nfaces*Nelements, sizeof(dlong)); - mapP = (dlong*) calloc(Nfp*Nfaces*Nelements, sizeof(dlong)); - - //check if we're connecting a periodic box mesh - int periodicFlag = 0; - if (settings.compareSetting("MESH FILE","BOX") && - settings.compareSetting("BOX BOUNDARY FLAG","-1")) - periodicFlag = 1; - - //box dimensions - dfloat DIMX, DIMY; - settings.getSetting("BOX DIMX", DIMX); - settings.getSetting("BOX DIMY", DIMY); - - //box is centered at the origin - DIMX /= 2.0; - DIMY /= 2.0; - - /* assume elements already connected */ - for(dlong e=0;e1e-4) right = false; - if (fabs(EX[vid]+DIMX)>1e-4) left = false; - if (fabs(EY[vid]-DIMY)>1e-4) top = false; - if (fabs(EY[vid]+DIMY)>1e-4) bottom = false; - } - - if (right) offsetX = -2.0*DIMX; - if (left) offsetX = 2.0*DIMX; - if (top) offsetY = -2.0*DIMY; - if (bottom) offsetY = 2.0*DIMY; - } - - /* for each node on this face find the neighbor node */ - for(int n=0;n1e-3) { - stringstream ss; - ss << "Bad match: x,y,z = " << x1 << ", " << y1 << ", " << z1 << "\n"; - LIBP_ABORT(ss.str()) - } - - return matchIndex; -} - - -// serial face-node to face-node connection -void mesh3D::ConnectFaceNodes(){ - - /* volume indices of the interior and exterior face nodes for each element */ - vmapM = (dlong*) calloc(Nfp*Nfaces*Nelements, 
sizeof(dlong)); - vmapP = (dlong*) calloc(Nfp*Nfaces*Nelements, sizeof(dlong)); - mapP = (dlong*) calloc(Nfp*Nfaces*Nelements, sizeof(dlong)); - - //check if we're connecting a periodic box mesh - int periodicFlag = 0; - if (settings.compareSetting("MESH FILE","BOX") && - settings.compareSetting("BOX BOUNDARY FLAG","-1")) - periodicFlag = 1; - - //box dimensions - dfloat DIMX, DIMY, DIMZ; - settings.getSetting("BOX DIMX", DIMX); - settings.getSetting("BOX DIMY", DIMY); - settings.getSetting("BOX DIMZ", DIMZ); - - //box is centered at the origin - DIMX /= 2.0; - DIMY /= 2.0; - DIMZ /= 2.0; - - /* assume elements already connected */ - for(dlong e=0;e1e-4) right = false; - if (fabs(EX[vid]+DIMX)>1e-4) left = false; - if (fabs(EY[vid]-DIMY)>1e-4) back = false; - if (fabs(EY[vid]+DIMY)>1e-4) front = false; - if (fabs(EZ[vid]-DIMZ)>1e-4) top = false; - if (fabs(EZ[vid]+DIMZ)>1e-4) bottom = false; - } - - if (right) offsetX = -2.0*DIMX; - if (left) offsetX = 2.0*DIMX; - if (back) offsetY = -2.0*DIMY; - if (front) offsetY = 2.0*DIMY; - if (top) offsetZ = -2.0*DIMZ; - if (bottom) offsetZ = 2.0*DIMZ; - } - - /* for each node on this face find the neighbor node */ - for(int n=0;n0) { + for (int n=0;n0){ + + // reset change counter + gatherChange = 0; + + // send halo data and recv into extension of buffer + halo.Exchange(globalIds, Np); + halo.Exchange(mapB, Np); + + // compare trace nodes + // #pragma omp parallel for + for(dlong e=0;e 0) { + if (bcM == -1) { + //if theres no bc here yet, write it + mapB[idM] = bcP; + ++gatherChange; + } else if (bcP(mapB); +} + +} //namespace libp diff --git a/libs/mesh/meshCubatureNodesHex3D.cpp b/libs/mesh/meshCubatureNodesHex3D.cpp index 5c9848a91..384ee5dd2 100644 --- a/libs/mesh/meshCubatureNodesHex3D.cpp +++ b/libs/mesh/meshCubatureNodesHex3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,31 +25,32 @@ SOFTWARE. */ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" -void meshHex3D::CubatureNodes(){ +namespace libp { - cubx = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat)); - cuby = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat)); - cubz = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat)); +void mesh_t::CubaturePhysicalNodesHex3D(){ + + cubx.malloc(Nelements*cubNp); + cuby.malloc(Nelements*cubNp); + cubz.malloc(Nelements*cubNp); //temp arrays - dfloat *Ix1 = (dfloat*) calloc(Nq*Nq*cubNq, sizeof(dfloat)); - dfloat *Iy1 = (dfloat*) calloc(Nq*Nq*cubNq, sizeof(dfloat)); - dfloat *Iz1 = (dfloat*) calloc(Nq*Nq*cubNq, sizeof(dfloat)); + memory Ix1(Nq*Nq*cubNq); + memory Iy1(Nq*Nq*cubNq); + memory Iz1(Nq*Nq*cubNq); - dfloat *Ix2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat)); - dfloat *Iy2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat)); - dfloat *Iz2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat)); + memory Ix2(Nq*cubNq*cubNq); + memory Iy2(Nq*cubNq*cubNq); + memory Iz2(Nq*cubNq*cubNq); for(dlong e=0;e(Nelements*cubNp, cubx); + o_cuby = platform.malloc(Nelements*cubNp, cuby); + o_cubz = platform.malloc(Nelements*cubNp, cubz); //Face cubature - intx = (dfloat*) calloc(Nelements*Nfaces*cubNfp, sizeof(dfloat)); - inty = (dfloat*) calloc(Nelements*Nfaces*cubNfp, sizeof(dfloat)); - intz = (dfloat*) calloc(Nelements*Nfaces*cubNfp, sizeof(dfloat)); + intx.malloc(Nelements*Nfaces*cubNfp); + inty.malloc(Nelements*Nfaces*cubNfp); + intz.malloc(Nelements*Nfaces*cubNfp); - dfloat *ix = (dfloat *) calloc(cubNq*Nq,sizeof(dfloat)); - dfloat *iy = (dfloat *) calloc(cubNq*Nq,sizeof(dfloat)); - dfloat *iz = (dfloat *) calloc(cubNq*Nq,sizeof(dfloat)); + memory ix(cubNq*Nq); + memory iy(cubNq*Nq); + memory iz(cubNq*Nq); for(dlong e=0;e(Nelements*Nfaces*cubNfp, intx); + o_inty = 
platform.malloc(Nelements*Nfaces*cubNfp, inty); + o_intz = platform.malloc(Nelements*Nfaces*cubNfp, intz); } + +} //namespace libp diff --git a/libs/mesh/meshCubatureNodesQuad2D.cpp b/libs/mesh/meshCubatureNodesQuad2D.cpp index 70f40f46b..eecfa33a3 100644 --- a/libs/mesh/meshCubatureNodesQuad2D.cpp +++ b/libs/mesh/meshCubatureNodesQuad2D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,23 +25,24 @@ SOFTWARE. */ #include "mesh.hpp" -#include "mesh/mesh2D.hpp" -void meshQuad2D::CubatureNodes(){ +namespace libp { - cubx = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat)); - cuby = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat)); +void mesh_t::CubaturePhysicalNodesQuad2D(){ + + cubx.malloc(Nelements*cubNp); + cuby.malloc(Nelements*cubNp); //temp arrays - dfloat *Ix1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat)); - dfloat *Iy1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat)); + memory Ix1(Nq*cubNq); + memory Iy1(Nq*cubNq); for(dlong e=0;e(Nelements*cubNp, cubx); + o_cuby = platform.malloc(Nelements*cubNp, cuby); //Face cubature - intx = (dfloat*) calloc(Nelements*Nfaces*cubNq, sizeof(dfloat)); - inty = (dfloat*) calloc(Nelements*Nfaces*cubNq, sizeof(dfloat)); + intx.malloc(Nelements*Nfaces*cubNq); + inty.malloc(Nelements*Nfaces*cubNq); for(dlong e=0;e(Nelements*Nfaces*cubNq, intx); + o_inty = platform.malloc(Nelements*Nfaces*cubNq, inty); } + +} //namespace libp diff --git a/libs/mesh/meshCubatureNodesQuad3D.cpp b/libs/mesh/meshCubatureNodesQuad3D.cpp index f50bca720..c0210f631 100644 --- a/libs/mesh/meshCubatureNodesQuad3D.cpp +++ b/libs/mesh/meshCubatureNodesQuad3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali 
Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,27 +25,28 @@ SOFTWARE. */ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" -void meshQuad3D::CubatureNodes(){ +namespace libp { - cubx = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat)); - cuby = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat)); - cubz = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat)); +void mesh_t::CubaturePhysicalNodesQuad3D(){ + + cubx.malloc(Nelements*cubNp); + cuby.malloc(Nelements*cubNp); + cubz.malloc(Nelements*cubNp); //temp arrays - dfloat *Ix1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat)); - dfloat *Iy1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat)); - dfloat *Iz1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat)); + memory Ix1(Nq*cubNq); + memory Iy1(Nq*cubNq); + memory Iz1(Nq*cubNq); for(dlong e=0;e(Nelements*cubNp, cubx); + o_cuby = platform.malloc(Nelements*cubNp, cuby); + o_cubz = platform.malloc(Nelements*cubNp, cubz); //Face cubature - intx = (dfloat*) calloc(Nelements*Nfaces*cubNq, sizeof(dfloat)); - inty = (dfloat*) calloc(Nelements*Nfaces*cubNq, sizeof(dfloat)); - intz = (dfloat*) calloc(Nelements*Nfaces*cubNq, sizeof(dfloat)); + intx.malloc(Nelements*Nfaces*cubNq); + inty.malloc(Nelements*Nfaces*cubNq); + intz.malloc(Nelements*Nfaces*cubNq); for(dlong e=0;e(Nelements*Nfaces*cubNq, intx); + o_inty = platform.malloc(Nelements*Nfaces*cubNq, inty); + o_intz = platform.malloc(Nelements*Nfaces*cubNq, intz); } + +} //namespace libp diff --git a/libs/mesh/meshCubatureNodesTet3D.cpp b/libs/mesh/meshCubatureNodesTet3D.cpp index a3cfccd8f..3e83718c0 100644 --- a/libs/mesh/meshCubatureNodesTet3D.cpp +++ b/libs/mesh/meshCubatureNodesTet3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel 
Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,14 +25,15 @@ SOFTWARE. */ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" -void meshTet3D::CubatureNodes(){ +namespace libp { + +void mesh_t::CubaturePhysicalNodesTet3D(){ if(cubNp){ - cubx = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat)); - cuby = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat)); - cubz = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat)); + cubx.malloc(Nelements*cubNp); + cuby.malloc(Nelements*cubNp); + cubz.malloc(Nelements*cubNp); dlong cnt = 0; for(dlong e=0;e(Nelements*cubNp, cubx); + o_cuby = platform.malloc(Nelements*cubNp, cuby); + o_cubz = platform.malloc(Nelements*cubNp, cubz); } //Face cubature if(intNfp){ // printf("Integration number of points: %d \n",intNfp); - intx = (dfloat*) calloc(Nelements*Nfaces*intNfp, sizeof(dfloat)); - inty = (dfloat*) calloc(Nelements*Nfaces*intNfp, sizeof(dfloat)); - intz = (dfloat*) calloc(Nelements*Nfaces*intNfp, sizeof(dfloat)); + intx.malloc(Nelements*Nfaces*intNfp); + inty.malloc(Nelements*Nfaces*intNfp); + intz.malloc(Nelements*Nfaces*intNfp); for(dlong e=0;e(Nelements*Nfaces*intNfp, intx); + o_inty = platform.malloc(Nelements*Nfaces*intNfp, inty); + o_intz = platform.malloc(Nelements*Nfaces*intNfp, intz); } } + +} //namespace libp diff --git a/libs/mesh/meshCubatureNodesTri2D.cpp b/libs/mesh/meshCubatureNodesTri2D.cpp index 2d3d9abfb..66ed2f959 100644 --- a/libs/mesh/meshCubatureNodesTri2D.cpp +++ b/libs/mesh/meshCubatureNodesTri2D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,12 +25,13 @@ SOFTWARE. 
*/ #include "mesh.hpp" -#include "mesh/mesh2D.hpp" -void meshTri2D::CubatureNodes(){ +namespace libp { - cubx = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat)); - cuby = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat)); +void mesh_t::CubaturePhysicalNodesTri2D(){ + + cubx.malloc(Nelements*cubNp); + cuby.malloc(Nelements*cubNp); dlong cnt = 0; for(dlong e=0;e(Nelements*cubNp, cubx); + o_cuby = platform.malloc(Nelements*cubNp, cuby); //Face cubature - intx = (dfloat*) calloc(Nelements*Nfaces*intNfp, sizeof(dfloat)); - inty = (dfloat*) calloc(Nelements*Nfaces*intNfp, sizeof(dfloat)); + intx.malloc(Nelements*Nfaces*intNfp); + inty.malloc(Nelements*Nfaces*intNfp); for(dlong e=0;e(Nelements*Nfaces*intNfp, intx); + o_inty = platform.malloc(Nelements*Nfaces*intNfp, inty); } + +} //namespace libp diff --git a/libs/mesh/meshCubatureNodesTri3D.cpp b/libs/mesh/meshCubatureNodesTri3D.cpp index 065fd1396..6203b971b 100644 --- a/libs/mesh/meshCubatureNodesTri3D.cpp +++ b/libs/mesh/meshCubatureNodesTri3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,13 +25,14 @@ SOFTWARE. 
*/ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" -void meshTri3D::CubatureNodes(){ +namespace libp { - cubx = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat)); - cuby = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat)); - cubz = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat)); +void mesh_t::CubaturePhysicalNodesTri3D(){ + + cubx.malloc(Nelements*cubNp); + cuby.malloc(Nelements*cubNp); + cubz.malloc(Nelements*cubNp); dlong cnt = 0; for(dlong e=0;e(Nelements*cubNp, cubx); + o_cuby = platform.malloc(Nelements*cubNp, cuby); + o_cubz = platform.malloc(Nelements*cubNp, cubz); //Face cubature - intx = (dfloat*) calloc(Nelements*Nfaces*intNfp, sizeof(dfloat)); - inty = (dfloat*) calloc(Nelements*Nfaces*intNfp, sizeof(dfloat)); - intz = (dfloat*) calloc(Nelements*Nfaces*intNfp, sizeof(dfloat)); + intx.malloc(Nelements*Nfaces*intNfp); + inty.malloc(Nelements*Nfaces*intNfp); + intz.malloc(Nelements*Nfaces*intNfp); for(dlong e=0;e(Nelements*Nfaces*intNfp, intx); + o_inty = platform.malloc(Nelements*Nfaces*intNfp, inty); + o_intz = platform.malloc(Nelements*Nfaces*intNfp, intz); } + +} //namespace libp diff --git a/libs/mesh/meshCubatureSetupHex3D.cpp b/libs/mesh/meshCubatureSetupHex3D.cpp index d2e05db24..a0dd5bc62 100644 --- a/libs/mesh/meshCubatureSetupHex3D.cpp +++ b/libs/mesh/meshCubatureSetupHex3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,9 +25,10 @@ SOFTWARE. 
*/ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" -void meshHex3D::CubatureSetup(){ +namespace libp { + +void mesh_t::CubatureSetupHex3D(){ /* Quadrature data */ cubN = N+1; @@ -37,24 +38,19 @@ void meshHex3D::CubatureSetup(){ intNfp = cubNq*cubNq; // cubN+1 point Gauss-Legendre quadrature - cubr = (dfloat *) malloc(cubNq*sizeof(dfloat)); - cubw = (dfloat *) malloc(cubNq*sizeof(dfloat)); JacobiGQ(0, 0, cubN, cubr, cubw); // GLL to GL interpolation matrix - cubInterp = (dfloat *) malloc(Nq*cubNq*sizeof(dfloat)); - InterpolationMatrix1D(N, Nq, r, cubNq, cubr, cubInterp); //uses the fact that r = gllz for 1:Nq + InterpolationMatrix1D(N, gllz, cubr, cubInterp); //cubature project cubProject = cubInterp^T - cubProject = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat)); - matrixTranspose(cubNq, Nq, cubInterp, Nq, cubProject, cubNq); + cubProject.malloc(cubNq*Nq); + linAlg_t::matrixTranspose(cubNq, Nq, cubInterp, Nq, cubProject, cubNq); //cubature derivates matrix, cubD: differentiate on cubature nodes - cubD = (dfloat *) malloc(cubNq*cubNq*sizeof(dfloat)); - Dmatrix1D(cubN, cubNq, cubr, cubNq, cubr, cubD); + Dmatrix1D(cubN, cubr, cubr, cubD); // weak cubature derivative cubPDT = cubProject * cubD^T - cubPDT = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat)); CubatureWeakDmatrix1D(Nq, cubNq, cubProject, cubD, cubPDT); // add compile time constants to kernels @@ -65,73 +61,69 @@ void meshHex3D::CubatureSetup(){ props["defines/" "p_cubNfp"]= cubNfp; // build transposes (we hold matrices as column major on device) - dfloat *cubProjectT = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat)); - dfloat *cubInterpT = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat)); - matrixTranspose(cubNq, Nq, cubInterp, Nq, cubInterpT, cubNq); - matrixTranspose(Nq, cubNq, cubProject, cubNq, cubProjectT, Nq); + memory cubProjectT(cubNq*Nq); + memory cubInterpT(cubNq*Nq); + linAlg_t::matrixTranspose(cubNq, Nq, cubInterp, Nq, cubInterpT, cubNq); + linAlg_t::matrixTranspose(Nq, cubNq, cubProject, cubNq, cubProjectT, Nq); - 
dfloat *cubPDTT = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat)); - matrixTranspose(Nq, cubNq, cubPDT, cubNq, cubPDTT, Nq); + memory cubPDTT(cubNq*Nq); + linAlg_t::matrixTranspose(Nq, cubNq, cubPDT, cubNq, cubPDTT, Nq); - o_cubInterp = platform.malloc(Nq*cubNq*sizeof(dfloat), cubInterpT); - o_cubProject = platform.malloc(Nq*cubNq*sizeof(dfloat), cubProjectT); + o_cubInterp = platform.malloc(Nq*cubNq, cubInterpT); + o_cubProject = platform.malloc(Nq*cubNq, cubProjectT); - o_cubPDT = platform.malloc(Nq*cubNq*sizeof(dfloat), cubPDTT); - o_cubD = platform.malloc(cubNq*cubNq*sizeof(dfloat), cubD); + o_cubPDT = platform.malloc(Nq*cubNq, cubPDTT); + o_cubD = platform.malloc(cubNq*cubNq, cubD); o_intInterp = o_cubInterp; o_intLIFT = o_cubProject; - free(cubPDTT); - free(cubProjectT); - free(cubInterpT); - - cubvgeo = (dfloat*) calloc(Nelements*Nvgeo*cubNp, sizeof(dfloat)); - cubggeo = (dfloat*) calloc(Nelements*Nggeo*cubNp, sizeof(dfloat)); - - cubsgeo = (dfloat*) calloc(Nelements*Nsgeo*cubNq*cubNq*Nfaces, sizeof(dfloat)); + cubwJ.malloc(Nelements*cubNp); + cubvgeo.malloc(Nelements*Nvgeo*cubNp); + cubggeo.malloc(Nelements*Nggeo*cubNp); + cubsgeo.malloc(Nelements*Nsgeo*cubNq*cubNq*Nfaces); //temp arrays - dfloat *xre = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *xse = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *xte = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *yre = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *yse = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *yte = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *zre = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *zse = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *zte = (dfloat*) calloc(Np, sizeof(dfloat)); - - dfloat *xre1 = (dfloat*) calloc(Nq*Nq*cubNq, sizeof(dfloat)); - dfloat *xse1 = (dfloat*) calloc(Nq*Nq*cubNq, sizeof(dfloat)); - dfloat *xte1 = (dfloat*) calloc(Nq*Nq*cubNq, sizeof(dfloat)); - dfloat *yre1 = (dfloat*) calloc(Nq*Nq*cubNq, sizeof(dfloat)); - dfloat *yse1 = (dfloat*) 
calloc(Nq*Nq*cubNq, sizeof(dfloat)); - dfloat *yte1 = (dfloat*) calloc(Nq*Nq*cubNq, sizeof(dfloat)); - dfloat *zre1 = (dfloat*) calloc(Nq*Nq*cubNq, sizeof(dfloat)); - dfloat *zse1 = (dfloat*) calloc(Nq*Nq*cubNq, sizeof(dfloat)); - dfloat *zte1 = (dfloat*) calloc(Nq*Nq*cubNq, sizeof(dfloat)); - - dfloat *xre2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat)); - dfloat *xse2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat)); - dfloat *xte2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat)); - dfloat *yre2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat)); - dfloat *yse2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat)); - dfloat *yte2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat)); - dfloat *zre2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat)); - dfloat *zse2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat)); - dfloat *zte2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat)); + memory xre(Np); + memory xse(Np); + memory xte(Np); + memory yre(Np); + memory yse(Np); + memory yte(Np); + memory zre(Np); + memory zse(Np); + memory zte(Np); + + memory xre1(Nq*Nq*cubNq); + memory xse1(Nq*Nq*cubNq); + memory xte1(Nq*Nq*cubNq); + memory yre1(Nq*Nq*cubNq); + memory yse1(Nq*Nq*cubNq); + memory yte1(Nq*Nq*cubNq); + memory zre1(Nq*Nq*cubNq); + memory zse1(Nq*Nq*cubNq); + memory zte1(Nq*Nq*cubNq); + + memory xre2(Nq*cubNq*cubNq); + memory xse2(Nq*cubNq*cubNq); + memory xte2(Nq*cubNq*cubNq); + memory yre2(Nq*cubNq*cubNq); + memory yse2(Nq*cubNq*cubNq); + memory yte2(Nq*cubNq*cubNq); + memory zre2(Nq*cubNq*cubNq); + memory zse2(Nq*cubNq*cubNq); + memory zte2(Nq*cubNq*cubNq); //surface temp arrays - dfloat *xr1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat)); - dfloat *xs1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat)); - dfloat *xt1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat)); - dfloat *yr1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat)); - dfloat *ys1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat)); - dfloat *yt1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat)); - dfloat *zr1 = 
(dfloat*) calloc(Nq*cubNq, sizeof(dfloat)); - dfloat *zs1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat)); - dfloat *zt1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat)); + memory xr1(Nq*cubNq); + memory xs1(Nq*cubNq); + memory xt1(Nq*cubNq); + memory yr1(Nq*cubNq); + memory ys1(Nq*cubNq); + memory yt1(Nq*cubNq); + memory zr1(Nq*cubNq); + memory zs1(Nq*cubNq); + memory zt1(Nq*cubNq); //geometric data for quadrature for(dlong e=0;e(Nelements*cubNp, cubwJ); + o_cubvgeo = platform.malloc(Nelements*Nvgeo*cubNp, cubvgeo); + o_cubsgeo = platform.malloc(Nelements*Nfaces*cubNq*cubNq*Nsgeo, cubsgeo); + o_cubggeo = platform.malloc(Nelements*Nggeo*cubNp, cubggeo); } + +} //namespace libp diff --git a/libs/mesh/meshCubatureSetupQuad2D.cpp b/libs/mesh/meshCubatureSetupQuad2D.cpp index 10989098c..e806600f2 100644 --- a/libs/mesh/meshCubatureSetupQuad2D.cpp +++ b/libs/mesh/meshCubatureSetupQuad2D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,16 +25,10 @@ SOFTWARE. 
*/ #include "mesh.hpp" -#include "mesh/mesh2D.hpp" -#include "mesh/mesh3D.hpp" -void meshQuad3D::CubatureSetup(){ - mesh_t *mesh_p = (mesh_t*) this; - meshQuad2D* trimesh = (meshQuad2D*) mesh_p; - trimesh->meshQuad2D::CubatureSetup(); -} +namespace libp { -void meshQuad2D::CubatureSetup(){ +void mesh_t::CubatureSetupQuad2D(){ /* Quadrature data */ cubN = N+1; @@ -44,24 +38,19 @@ void meshQuad2D::CubatureSetup(){ intNfp = cubNq; // cubN+1 point Gauss-Legendre quadrature - cubr = (dfloat *) malloc(cubNq*sizeof(dfloat)); - cubw = (dfloat *) malloc(cubNq*sizeof(dfloat)); JacobiGQ(0, 0, cubN, cubr, cubw); // GLL to GL interpolation matrix - cubInterp = (dfloat *) malloc(Nq*cubNq*sizeof(dfloat)); - InterpolationMatrix1D(N, Nq, r, cubNq, cubr, cubInterp); //uses the fact that r = gllz for 1:Nq + InterpolationMatrix1D(N, gllz, cubr, cubInterp); //cubature project cubProject = cubInterp^T - cubProject = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat)); - matrixTranspose(cubNq, Nq, cubInterp, Nq, cubProject, cubNq); + cubProject.malloc(cubNq*Nq); + linAlg_t::matrixTranspose(cubNq, Nq, cubInterp, Nq, cubProject, cubNq); //cubature derivates matrix, cubD: differentiate on cubature nodes - cubD = (dfloat *) malloc(cubNq*cubNq*sizeof(dfloat)); - Dmatrix1D(cubN, cubNq, cubr, cubNq, cubr, cubD); + Dmatrix1D(cubN, cubr, cubr, cubD); // weak cubature derivative cubPDT = cubProject * cubD^T - cubPDT = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat)); CubatureWeakDmatrix1D(Nq, cubNq, cubProject, cubD, cubPDT); // add compile time constants to kernels @@ -72,42 +61,38 @@ void meshQuad2D::CubatureSetup(){ props["defines/" "p_cubNfp"]= cubNfp; // build transposes (we hold matrices as column major on device) - dfloat *cubProjectT = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat)); - dfloat *cubInterpT = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat)); - matrixTranspose(cubNq, Nq, cubInterp, Nq, cubInterpT, cubNq); - matrixTranspose(Nq, cubNq, cubProject, cubNq, cubProjectT, Nq); + memory cubProjectT(cubNq*Nq); 
+ memory cubInterpT(cubNq*Nq); + linAlg_t::matrixTranspose(cubNq, Nq, cubInterp, Nq, cubInterpT, cubNq); + linAlg_t::matrixTranspose(Nq, cubNq, cubProject, cubNq, cubProjectT, Nq); - dfloat *cubPDTT = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat)); - matrixTranspose(Nq, cubNq, cubPDT, cubNq, cubPDTT, Nq); + memory cubPDTT(cubNq*Nq); + linAlg_t::matrixTranspose(Nq, cubNq, cubPDT, cubNq, cubPDTT, Nq); - o_cubInterp = platform.malloc(Nq*cubNq*sizeof(dfloat), cubInterpT); - o_cubProject = platform.malloc(Nq*cubNq*sizeof(dfloat), cubProjectT); + o_cubInterp = platform.malloc(Nq*cubNq, cubInterpT); + o_cubProject = platform.malloc(Nq*cubNq, cubProjectT); - o_cubPDT = platform.malloc(Nq*cubNq*sizeof(dfloat), cubPDTT); - o_cubD = platform.malloc(cubNq*cubNq*sizeof(dfloat), cubD); + o_cubPDT = platform.malloc(Nq*cubNq, cubPDTT); + o_cubD = platform.malloc(cubNq*cubNq, cubD); o_intInterp = o_cubInterp; o_intLIFT = o_cubProject; - free(cubPDTT); - free(cubProjectT); - free(cubInterpT); - - cubvgeo = (dfloat*) calloc(Nelements*Nvgeo*cubNp, sizeof(dfloat)); - cubggeo = (dfloat*) calloc(Nelements*Nggeo*cubNp, sizeof(dfloat)); - - cubsgeo = (dfloat*) calloc(Nelements*Nsgeo*cubNq*Nfaces, sizeof(dfloat)); + cubwJ.malloc(Nelements*cubNp); + cubvgeo.malloc(Nelements*Nvgeo*cubNp); + cubggeo.malloc(Nelements*Nggeo*cubNp); + cubsgeo.malloc(Nelements*Nsgeo*cubNq*Nfaces); //temp arrays - dfloat *xre = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *xse = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *yre = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *yse = (dfloat*) calloc(Np, sizeof(dfloat)); + memory xre(Np); + memory xse(Np); + memory yre(Np); + memory yse(Np); - dfloat *xre1 = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat)); - dfloat *xse1 = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat)); - dfloat *yre1 = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat)); - dfloat *yse1 = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat)); + memory xre1(cubNq*Nq); + memory xse1(cubNq*Nq); + memory yre1(cubNq*Nq); + memory 
yse1(cubNq*Nq); //geometric data for quadrature for(dlong e=0;e(Nelements*cubNp, cubwJ); + o_cubvgeo = platform.malloc(Nelements*Nvgeo*cubNp, cubvgeo); + o_cubggeo = platform.malloc(Nelements*Nggeo*cubNp, cubggeo); + o_cubsgeo = platform.malloc(Nelements*Nfaces*cubNq*Nsgeo, cubsgeo); } + +} //namespace libp diff --git a/libs/mesh/meshCubatureSetupTet3D.cpp b/libs/mesh/meshCubatureSetupTet3D.cpp index 52817c813..a24b9f706 100644 --- a/libs/mesh/meshCubatureSetupTet3D.cpp +++ b/libs/mesh/meshCubatureSetupTet3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,41 +25,39 @@ SOFTWARE. */ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" -void meshTet3D::CubatureSetup(){ +namespace libp { + +void mesh_t::CubatureSetupTet3D(){ /* Cubature data */ cubN = 2*N; //cubature order - CubatureNodesTet3D(cubN, &cubNp, &cubr, &cubs, &cubt, &cubw); + CubatureNodesTet3D(cubN, cubNp, cubr, cubs, cubt, cubw); - cubInterp = (dfloat *) malloc(Np*cubNp*sizeof(dfloat)); - InterpolationMatrixTet3D(N, Np, r, s, t, cubNp, cubr, cubs, cubt, cubInterp); + InterpolationMatrixTet3D(N, r, s, t, cubr, cubs, cubt, cubInterp); //cubature project cubProject = M^{-1} * cubInterp^T // Defined such that cubProject * cubW * cubInterp = Identity - cubProject = (dfloat*) calloc(cubNp*Np, sizeof(dfloat)); - CubaturePmatrixTet3D(N, Np, r, s, t, cubNp, cubr, cubs, cubt, cubProject); + CubaturePmatrixTet3D(N, r, s, t, cubr, cubs, cubt, cubProject); //cubature derivates matrices, cubD: differentiate on cubature nodes // we dont use cubD on Tris/Tets so skip computing // Instead, it's cheaper to: // make weak cubature derivatives cubPDT = cubProject * cubD^T - cubPDT = (dfloat*) calloc(3*cubNp*Np, sizeof(dfloat)); 
+ CubatureWeakDmatricesTet3D(N, r, s, t, + cubr, cubs, cubt, + cubPDT); cubPDrT = cubPDT + 0*cubNp*Np; cubPDsT = cubPDT + 1*cubNp*Np; cubPDtT = cubPDT + 2*cubNp*Np; - CubatureWeakDmatricesTet3D(N, Np, r, s, t, cubNp, cubr, cubs, cubt, - cubPDrT, cubPDsT, cubPDtT); // Surface cubature nodes - CubatureNodesTri2D(cubN, &intNfp, &intr, &ints, &intw); + CubatureNodesTri2D(cubN, intNfp, intr, ints, intw); cubNfp = intNfp; - intInterp = (dfloat*) calloc(intNfp*Nfaces*Nfp, sizeof(dfloat)); - intLIFT = (dfloat*) calloc(Nfaces*intNfp*Np, sizeof(dfloat)); - CubatureSurfaceMatricesTet3D(N, Np, r, s, t, faceNodes, intNfp, intr, ints, intw, + CubatureSurfaceMatricesTet3D(N, r, s, t, faceNodes, + intr, ints, intw, intInterp, intLIFT); // add compile time constants to kernels @@ -70,10 +68,10 @@ void meshTet3D::CubatureSetup(){ props["defines/" "p_cubNfp"]= cubNfp; // build transposes (we hold matrices as column major on device) - dfloat *cubProjectT = (dfloat*) calloc(cubNp*Np, sizeof(dfloat)); - dfloat *cubInterpT = (dfloat*) calloc(cubNp*Np, sizeof(dfloat)); - matrixTranspose(cubNp, Np, cubInterp, Np, cubInterpT, cubNp); - matrixTranspose(Np, cubNp, cubProject, cubNp, cubProjectT, Np); + memory cubProjectT(cubNp*Np); + memory cubInterpT(cubNp*Np); + linAlg_t::matrixTranspose(cubNp, Np, cubInterp, Np, cubInterpT, cubNp); + linAlg_t::matrixTranspose(Np, cubNp, cubProject, cubNp, cubProjectT, Np); //pre-multiply cubProject by W on device for(int n=0;n cubPDTT(3*cubNp*Np); + memory cubPDrTT = cubPDTT + 0*cubNp*Np; + memory cubPDsTT = cubPDTT + 1*cubNp*Np; + memory cubPDtTT = cubPDTT + 2*cubNp*Np; + linAlg_t::matrixTranspose(Np, cubNp, cubPDrT, cubNp, cubPDrTT, Np); + linAlg_t::matrixTranspose(Np, cubNp, cubPDsT, cubNp, cubPDsTT, Np); + linAlg_t::matrixTranspose(Np, cubNp, cubPDtT, cubNp, cubPDtTT, Np); //pre-multiply cubPDT by W on device for(int n=0;n intLIFTT(Np*Nfaces*intNfp); + memory intInterpT(Nfp*Nfaces*intNfp); + linAlg_t::matrixTranspose(Np, Nfaces*intNfp, intLIFT, 
Nfaces*intNfp, intLIFTT, Np); + linAlg_t::matrixTranspose(Nfaces*intNfp, Nfp, intInterp, Nfp, intInterpT, Nfaces*intNfp); - o_cubvgeo = o_vgeo;// dummy - o_cubsgeo = o_sgeo; //dummy cubature geo factors + o_cubInterp = platform.malloc(Np*cubNp, cubInterpT); + o_cubProject = platform.malloc(Np*cubNp, cubProjectT); - o_cubInterp = platform.malloc(Np*cubNp*sizeof(dfloat), cubInterpT); - o_cubProject = platform.malloc(Np*cubNp*sizeof(dfloat), cubProjectT); + o_cubPDT = platform.malloc(3*Np*cubNp, cubPDTT); - o_cubPDT = platform.malloc(3*Np*cubNp*sizeof(dfloat), cubPDTT); - o_cubD = o_cubPDT; //dummy + o_intInterp = platform.malloc(Nfp*Nfaces*intNfp, intInterpT); + o_intLIFT = platform.malloc(Np*Nfaces*intNfp, intLIFTT); - o_intInterp = platform.malloc(Nfp*Nfaces*intNfp*sizeof(dfloat), intInterpT); - o_intLIFT = platform.malloc(Np*Nfaces*intNfp*sizeof(dfloat), intLIFTT); - - free(cubPDTT); - free(cubProjectT); - free(cubInterpT); - free(intLIFTT); - free(intInterpT); + o_cubwJ = o_wJ; + o_cubvgeo = o_vgeo; + o_cubggeo = o_ggeo; + o_cubsgeo = o_sgeo; } + +} //namespace libp diff --git a/libs/mesh/meshCubatureSetupTri2D.cpp b/libs/mesh/meshCubatureSetupTri2D.cpp index a342f5e0f..e7c55785b 100644 --- a/libs/mesh/meshCubatureSetupTri2D.cpp +++ b/libs/mesh/meshCubatureSetupTri2D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,50 +25,40 @@ SOFTWARE. 
*/ #include "mesh.hpp" -#include "mesh/mesh2D.hpp" -#include "mesh/mesh3D.hpp" -void meshTri3D::CubatureSetup(){ - mesh_t *mesh_p = (mesh_t*) this; - meshTri2D* trimesh = (meshTri2D*) mesh_p; - trimesh->meshTri2D::CubatureSetup(); -} +namespace libp { -void meshTri2D::CubatureSetup(){ +void mesh_t::CubatureSetupTri2D(){ /* Cubature data */ cubN = 2*N; //cubature order - CubatureNodesTri2D(cubN, &cubNp, &cubr, &cubs, &cubw); + CubatureNodesTri2D(cubN, cubNp, cubr, cubs, cubw); - cubInterp = (dfloat *) malloc(Np*cubNp*sizeof(dfloat)); - InterpolationMatrixTri2D(N, Np, r, s, cubNp, cubr, cubs, cubInterp); + InterpolationMatrixTri2D(N, r, s, cubr, cubs, cubInterp); //cubature project cubProject = M^{-1} * cubInterp^T // Defined such that cubProject * cubW * cubInterp = Identity - cubProject = (dfloat*) calloc(cubNp*Np, sizeof(dfloat)); - CubaturePmatrixTri2D(N, Np, r, s, cubNp, cubr, cubs, cubProject); + CubaturePmatrixTri2D(N, r, s, cubr, cubs, cubProject); //cubature derivates matrices, cubD: differentiate on cubature nodes // we dont use cubD on Tris/Tets so skip computing // Instead, it's cheaper to: // make weak cubature derivatives cubPDT = cubProject * cubD^T - cubPDT = (dfloat*) calloc(2*cubNp*Np, sizeof(dfloat)); + CubatureWeakDmatricesTri2D(N, r, s, + cubr, cubs, + cubPDT); cubPDrT = cubPDT + 0*cubNp*Np; cubPDsT = cubPDT + 1*cubNp*Np; - CubatureWeakDmatricesTri2D(N, Np, r, s, cubNp, cubr, cubs, cubPDrT, cubPDsT); // cubN+1 point Gauss-Legendre quadrature for surface integrals cubNq = cubN+1; cubNfp = cubN+1; intNfp = cubN+1; - intr = (dfloat *) malloc(cubNfp*sizeof(dfloat)); - intw = (dfloat *) malloc(cubNfp*sizeof(dfloat)); JacobiGQ(0, 0, cubN, intr, intw); - intInterp = (dfloat*) calloc(intNfp*Nfaces*Nfp, sizeof(dfloat)); - intLIFT = (dfloat*) calloc(Nfaces*intNfp*Np, sizeof(dfloat)); - CubatureSurfaceMatricesTri2D(N, Np, r, s, faceNodes, intNfp, intr, intw, + CubatureSurfaceMatricesTri2D(N, r, s, faceNodes, + intr, intw, intInterp, intLIFT); // add compile 
time constants to kernels @@ -79,10 +69,10 @@ void meshTri2D::CubatureSetup(){ props["defines/" "p_cubNfp"]= cubNfp; // build transposes (we hold matrices as column major on device) - dfloat *cubProjectT = (dfloat*) calloc(cubNp*Np, sizeof(dfloat)); - dfloat *cubInterpT = (dfloat*) calloc(cubNp*Np, sizeof(dfloat)); - matrixTranspose(cubNp, Np, cubInterp, Np, cubInterpT, cubNp); - matrixTranspose(Np, cubNp, cubProject, cubNp, cubProjectT, Np); + memory cubProjectT(cubNp*Np); + memory cubInterpT(cubNp*Np); + linAlg_t::matrixTranspose(cubNp, Np, cubInterp, Np, cubInterpT, cubNp); + linAlg_t::matrixTranspose(Np, cubNp, cubProject, cubNp, cubProjectT, Np); //pre-multiply cubProject by W on device for(int n=0;n cubPDTT(2*cubNp*Np); + memory cubPDrTT = cubPDTT + 0*cubNp*Np; + memory cubPDsTT = cubPDTT + 1*cubNp*Np; + linAlg_t::matrixTranspose(Np, cubNp, cubPDrT, cubNp, cubPDrTT, Np); + linAlg_t::matrixTranspose(Np, cubNp, cubPDsT, cubNp, cubPDsTT, Np); //pre-multiply cubPDT by W on device for(int n=0;n intLIFTT(Np*Nfaces*intNfp); + memory intInterpT(Nfp*Nfaces*intNfp); + linAlg_t::matrixTranspose(Np, Nfaces*intNfp, intLIFT, Nfaces*intNfp, intLIFTT, Np); + linAlg_t::matrixTranspose(Nfaces*intNfp, Nfp, intInterp, Nfp, intInterpT, Nfaces*intNfp); - o_cubInterp = platform.malloc(Np*cubNp*sizeof(dfloat), cubInterpT); - o_cubProject = platform.malloc(Np*cubNp*sizeof(dfloat), cubProjectT); + o_cubInterp = platform.malloc(Np*cubNp, cubInterpT); + o_cubProject = platform.malloc(Np*cubNp, cubProjectT); - o_cubPDT = platform.malloc(2*cubNp*Np*sizeof(dfloat), cubPDTT); - o_cubD = o_cubPDT; //dummy + o_cubPDT = platform.malloc(2*cubNp*Np, cubPDTT); - o_intInterp = platform.malloc(Nfp*Nfaces*intNfp*sizeof(dfloat), intInterpT); - o_intLIFT = platform.malloc(Np*Nfaces*intNfp*sizeof(dfloat), intLIFTT); + o_intInterp = platform.malloc(Nfp*Nfaces*intNfp, intInterpT); + o_intLIFT = platform.malloc(Np*Nfaces*intNfp, intLIFTT); - free(cubPDTT); - free(cubProjectT); - free(cubInterpT); - 
free(intLIFTT); - free(intInterpT); + o_cubwJ = o_wJ; + o_cubvgeo = o_vgeo; + o_cubggeo = o_ggeo; + o_cubsgeo = o_sgeo; } + +} //namespace libp diff --git a/libs/mesh/meshParallelGatherScatterSetup.cpp b/libs/mesh/meshGatherScatterSetup.cpp similarity index 53% rename from libs/mesh/meshParallelGatherScatterSetup.cpp rename to libs/mesh/meshGatherScatterSetup.cpp index 88e034803..dcb978041 100644 --- a/libs/mesh/meshParallelGatherScatterSetup.cpp +++ b/libs/mesh/meshGatherScatterSetup.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -24,33 +24,72 @@ SOFTWARE. */ + #include "mesh.hpp" -void mesh_t::ParallelGatherScatterSetup() { +namespace libp { + +void mesh_t::GatherScatterSetup() { - dlong Ntotal = Np*Nelements; + dlong Ntotal = Nverts*(Nelements+totalHaloPairs); - int verbose = 0; - ogs = ogs_t::Setup(Ntotal, globalIds, comm, verbose, platform); + memory minRank(Ntotal); + memory maxRank(Ntotal); - //use the gs to find what nodes are local to this rank - int *minRank = (int *) calloc(Ntotal,sizeof(int)); - int *maxRank = (int *) calloc(Ntotal,sizeof(int)); for (dlong i=0;iGatherScatter(minRank, ogs_int, ogs_min, ogs_sym); //minRank[n] contains the smallest rank taking part in the gather of node n - ogs->GatherScatter(maxRank, ogs_int, ogs_max, ogs_sym); //maxRank[n] contains the largest rank taking part in the gather of node n + hlong gatherChange = 1; + + // keep comparing numbers on positive and negative traces until convergence + while(gatherChange>0){ + + // reset change counter + gatherChange = 0; + + // send halo data and recv into extension of buffer + halo.Exchange(minRank, Nverts); + halo.Exchange(maxRank, Nverts); + + // compare trace vertices + #pragma 
omp parallel for collapse(2) + for(dlong e=0;emaxRankM){ + gatherChange=1; + maxRank[idM] = maxRankP; + } + } + } + + // sum up changes + comm.Allreduce(gatherChange); + } // count elements that contribute to global C0 gather-scatter dlong globalCount = 0; dlong localCount = 0; for(dlong e=0;e(globalGatherElementList); + o_localGatherElementList = platform.malloc(localGatherElementList); } + +} //namespace libp diff --git a/libs/mesh/meshGeometricFactorsHex3D.cpp b/libs/mesh/meshGeometricFactorsHex3D.cpp index a45f5aa8c..1aae6b50e 100644 --- a/libs/mesh/meshGeometricFactorsHex3D.cpp +++ b/libs/mesh/meshGeometricFactorsHex3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,23 +25,73 @@ SOFTWARE. 
*/ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" -void meshHex3D::GeometricFactors(){ +namespace libp { - /* unified storage array for geometric factors */ +void mesh_t::GeometricFactorsHex3D(){ + + /*Set offsets*/ Nvgeo = 12; + RXID = 0; + RYID = 1; + RZID = 2; + SXID = 3; + SYID = 4; + SZID = 5; + TXID = 6; + TYID = 7; + TZID = 8; + JID = 9; + JWID = 10; + IJWID = 11; + + props["defines/" "p_Nvgeo"]= Nvgeo; + props["defines/" "p_RXID"]= RXID; + props["defines/" "p_SXID"]= SXID; + props["defines/" "p_TXID"]= TXID; + + props["defines/" "p_RYID"]= RYID; + props["defines/" "p_SYID"]= SYID; + props["defines/" "p_TYID"]= TYID; + + props["defines/" "p_RZID"]= RZID; + props["defines/" "p_SZID"]= SZID; + props["defines/" "p_TZID"]= TZID; + + props["defines/" "p_JID"]= JID; + props["defines/" "p_JWID"]= JWID; + props["defines/" "p_IJWID"]= IJWID; + + /* unified storage array for geometric factors */ /* note that we have volume geometric factors for each node */ - vgeo = (dfloat*) calloc((Nelements+totalHaloPairs)*Nvgeo*Np, sizeof(dfloat)); + vgeo.malloc((Nelements+totalHaloPairs)*Nvgeo*Np); + + Nggeo = 6; + + G00ID=0; + G01ID=1; + G02ID=2; + G11ID=3; + G12ID=4; + G22ID=5; + + props["defines/" "p_Nggeo"]= Nggeo; + props["defines/" "p_G00ID"]= G00ID; + props["defines/" "p_G01ID"]= G01ID; + props["defines/" "p_G02ID"]= G02ID; + props["defines/" "p_G11ID"]= G11ID; + props["defines/" "p_G12ID"]= G12ID; + props["defines/" "p_G22ID"]= G22ID; /* number of second order geometric factors */ - Nggeo = 7; + ggeo.malloc(Nelements*Nggeo*Np); - ggeo = (dfloat*) calloc(Nelements*Nggeo*Np, sizeof(dfloat)); + wJ.malloc(Nelements*Np); // dfloat minJ = 1e9, maxJ = -1e9, maxSkew = 0; + #pragma omp parallel for for(dlong e=0;e(wJ); + o_vgeo = platform.malloc(vgeo); + o_ggeo = platform.malloc(ggeo); + + #if 0 dfloat globalMinJ, globalMaxJ, globalMaxSkew; @@ -135,6 +190,6 @@ void meshHex3D::GeometricFactors(){ if(rank==0) printf("J in range [%g,%g] and max Skew = %g\n", globalMinJ, globalMaxJ, 
globalMaxSkew); #endif - - halo->Exchange(vgeo, Nvgeo*Np, ogs_dfloat); } + +} //namespace libp diff --git a/libs/mesh/meshGeometricFactorsQuad2D.cpp b/libs/mesh/meshGeometricFactorsQuad2D.cpp index 343de3525..32d04377a 100644 --- a/libs/mesh/meshGeometricFactorsQuad2D.cpp +++ b/libs/mesh/meshGeometricFactorsQuad2D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,20 +25,52 @@ SOFTWARE. */ #include "mesh.hpp" -#include "mesh/mesh2D.hpp" -void meshQuad2D::GeometricFactors(){ +namespace libp { - /* unified storage array for geometric factors */ +void mesh_t::GeometricFactorsQuad2D(){ + + /*Set offsets*/ Nvgeo = 7; + RXID = 0; + RYID = 1; + SXID = 2; + SYID = 3; + JID = 4; + JWID = 5; + IJWID = 6; + + props["defines/" "p_Nvgeo"]= Nvgeo; + props["defines/" "p_RXID"]= RXID; + props["defines/" "p_SXID"]= SXID; + props["defines/" "p_RYID"]= RYID; + props["defines/" "p_SYID"]= SYID; + props["defines/" "p_JID"]= JID; + props["defines/" "p_JWID"]= JWID; + props["defines/" "p_IJWID"]= IJWID; + + /* unified storage array for geometric factors */ /* note that we have volume geometric factors for each node */ - vgeo = (dfloat*) calloc((Nelements+totalHaloPairs)*Nvgeo*Np, sizeof(dfloat)); + vgeo.malloc((Nelements+totalHaloPairs)*Nvgeo*Np); + + Nggeo = 3; + + G00ID=0; + G01ID=1; + G11ID=2; + + props["defines/" "p_Nggeo"]= Nggeo; + props["defines/" "p_G00ID"]= G00ID; + props["defines/" "p_G01ID"]= G01ID; + props["defines/" "p_G11ID"]= G11ID; /* number of second order geometric factors */ - Nggeo = 4; - ggeo = (dfloat*) calloc(Nelements*Nggeo*Np, sizeof(dfloat)); + ggeo.malloc(Nelements*Nggeo*Np); + wJ.malloc(Nelements*Np); + + #pragma omp parallel for for(dlong e=0;eExchange(vgeo, 
Nvgeo*Np, ogs_dfloat); + halo.Exchange(vgeo, Nvgeo*Np); + + o_wJ = platform.malloc(wJ); + o_vgeo = platform.malloc(vgeo); + o_ggeo = platform.malloc(ggeo); } + +} //namespace libp diff --git a/libs/mesh/meshGeometricFactorsQuad3D.cpp b/libs/mesh/meshGeometricFactorsQuad3D.cpp index 361a95102..9c20c240e 100644 --- a/libs/mesh/meshGeometricFactorsQuad3D.cpp +++ b/libs/mesh/meshGeometricFactorsQuad3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,239 +25,284 @@ SOFTWARE. */ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" + +namespace libp { // custom geometric factors specialized for 3D quad on sphere -void meshQuad3D::GeometricFactors(){ +void mesh_t::GeometricFactorsQuad3D(){ + + /*Set offsets*/ + Nvgeo = 12; + + RXID = 0; + RYID = 1; + RZID = 2; + SXID = 3; + SYID = 4; + SZID = 5; + TXID = 6; + TYID = 7; + TZID = 8; + JID = 9; + JWID = 10; + IJWID = 11; + + props["defines/" "p_Nvgeo"]= Nvgeo; + props["defines/" "p_RXID"]= RXID; + props["defines/" "p_SXID"]= SXID; + props["defines/" "p_TXID"]= TXID; + + props["defines/" "p_RYID"]= RYID; + props["defines/" "p_SYID"]= SYID; + props["defines/" "p_TYID"]= TYID; + + props["defines/" "p_RZID"]= RZID; + props["defines/" "p_SZID"]= SZID; + props["defines/" "p_TZID"]= TZID; + + props["defines/" "p_JID"]= JID; + props["defines/" "p_JWID"]= JWID; + props["defines/" "p_IJWID"]= IJWID; /* unified storage array for geometric factors */ - Nvgeo = 12; // - /* note that we have volume geometric factors for each node */ - vgeo = (dfloat*) calloc((Nelements+totalHaloPairs)*Nvgeo*Np, sizeof(dfloat)); - - cubvgeo = (dfloat*) calloc(Nelements*Nvgeo*cubNp, sizeof(dfloat)); - - // Can be computed on the fly - Nggeo = 7; - ggeo = 
(dfloat *) calloc(Nelements*Np*Nggeo, sizeof(dfloat)); - - dfloat *cxr = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat)); - dfloat *cxs = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat)); - dfloat *cyr = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat)); - dfloat *cys = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat)); - dfloat *czr = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat)); - dfloat *czs = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat)); - dfloat *cx = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat)); - dfloat *cy = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat)); - dfloat *cz = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat)); + vgeo.malloc((Nelements+totalHaloPairs)*Nvgeo*Np); + + Nggeo = 6; + + G00ID=0; + G01ID=1; + G02ID=2; + G11ID=3; + G12ID=4; + G22ID=5; + + props["defines/" "p_Nggeo"]= Nggeo; + props["defines/" "p_G00ID"]= G00ID; + props["defines/" "p_G01ID"]= G01ID; + props["defines/" "p_G02ID"]= G02ID; + props["defines/" "p_G11ID"]= G11ID; + props["defines/" "p_G12ID"]= G12ID; + props["defines/" "p_G22ID"]= G22ID; + + /* number of second order geometric factors */ + ggeo.malloc(Nelements*Nggeo*Np); + + wJ.malloc(Nelements*Np); + + // dfloat *cxr = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat)); + // dfloat *cxs = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat)); + // dfloat *cyr = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat)); + // dfloat *cys = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat)); + // dfloat *czr = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat)); + // dfloat *czs = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat)); + // dfloat *cx = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat)); + // dfloat *cy = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat)); + // dfloat *cz = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat)); for(int e=0;eExchange(vgeo, Nvgeo*Np, ogs_dfloat); + halo.Exchange(vgeo, Nvgeo*Np); + + o_wJ = platform.malloc(wJ); + o_vgeo = platform.malloc(vgeo); + o_ggeo = platform.malloc(ggeo); } + +} //namespace libp diff --git a/libs/mesh/meshGeometricFactorsTet3D.cpp 
b/libs/mesh/meshGeometricFactorsTet3D.cpp index 7afed3694..5ab3dfe06 100644 --- a/libs/mesh/meshGeometricFactorsTet3D.cpp +++ b/libs/mesh/meshGeometricFactorsTet3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,20 +25,69 @@ SOFTWARE. */ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" -void meshTet3D::GeometricFactors(){ +namespace libp { + +void mesh_t::GeometricFactorsTet3D(){ + + /*Set offsets*/ + Nvgeo = 10; + + RXID = 0; + RYID = 1; + RZID = 2; + SXID = 3; + SYID = 4; + SZID = 5; + TXID = 6; + TYID = 7; + TZID = 8; + JID = 9; + + props["defines/" "p_Nvgeo"]= Nvgeo; + props["defines/" "p_RXID"]= RXID; + props["defines/" "p_SXID"]= SXID; + props["defines/" "p_TXID"]= TXID; + + props["defines/" "p_RYID"]= RYID; + props["defines/" "p_SYID"]= SYID; + props["defines/" "p_TYID"]= TYID; + + props["defines/" "p_RZID"]= RZID; + props["defines/" "p_SZID"]= SZID; + props["defines/" "p_TZID"]= TZID; + + props["defines/" "p_JID"]= JID; /* unified storage array for geometric factors */ - Nvgeo = 12; - vgeo = (dfloat*) calloc((Nelements+totalHaloPairs)*Nvgeo, sizeof(dfloat)); + vgeo.malloc((Nelements+totalHaloPairs)*Nvgeo); + + Nggeo = 6; + + G00ID=0; + G01ID=1; + G02ID=2; + G11ID=3; + G12ID=4; + G22ID=5; + + props["defines/" "p_Nggeo"]= Nggeo; + props["defines/" "p_G00ID"]= G00ID; + props["defines/" "p_G01ID"]= G01ID; + props["defines/" "p_G02ID"]= G02ID; + props["defines/" "p_G11ID"]= G11ID; + props["defines/" "p_G12ID"]= G12ID; + props["defines/" "p_G22ID"]= G22ID; /* number of second order geometric factors */ - Nggeo = 7; - ggeo = (dfloat*) calloc(Nelements*Nggeo, sizeof(dfloat)); + ggeo.malloc(Nelements*Nggeo); + + wJ.malloc(Nelements); + + // dfloat minJ = 1e9, maxJ 
= -1e9; - dfloat minJ = 1e9, maxJ = -1e9; + #pragma omp parallel for for(dlong e=0;eExchange(vgeo, Nvgeo, ogs_dfloat); + + halo.Exchange(vgeo, Nvgeo); + + o_wJ = platform.malloc(wJ); + o_vgeo = platform.malloc(vgeo); + o_ggeo = platform.malloc(ggeo); } + +} //namespace libp diff --git a/libs/mesh/meshGeometricFactorsTri2D.cpp b/libs/mesh/meshGeometricFactorsTri2D.cpp index 7087cf948..9560b5e02 100644 --- a/libs/mesh/meshGeometricFactorsTri2D.cpp +++ b/libs/mesh/meshGeometricFactorsTri2D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,19 +25,47 @@ SOFTWARE. */ #include "mesh.hpp" -#include "mesh/mesh2D.hpp" -void meshTri2D::GeometricFactors(){ +namespace libp { - /* unified storage array for geometric factors */ +void mesh_t::GeometricFactorsTri2D(){ + + /*Set offsets*/ Nvgeo = 5; - vgeo = (dfloat*) calloc((Nelements+totalHaloPairs)*Nvgeo, sizeof(dfloat)); + + RXID = 0; + RYID = 1; + SXID = 2; + SYID = 3; + JID = 4; + + props["defines/" "p_Nvgeo"]= Nvgeo; + props["defines/" "p_RXID"]= RXID; + props["defines/" "p_SXID"]= SXID; + props["defines/" "p_RYID"]= RYID; + props["defines/" "p_SYID"]= SYID; + props["defines/" "p_JID"]= JID; + + /* unified storage array for geometric factors */ + vgeo.malloc((Nelements+totalHaloPairs)*Nvgeo); + + Nggeo = 3; + + G00ID=0; + G01ID=1; + G11ID=2; + + props["defines/" "p_Nggeo"]= Nggeo; + props["defines/" "p_G00ID"]= G00ID; + props["defines/" "p_G01ID"]= G01ID; + props["defines/" "p_G11ID"]= G11ID; /* number of second order geometric factors */ - Nggeo = 4; - ggeo = (dfloat*) calloc(Nelements*Nggeo, sizeof(dfloat)); + ggeo.malloc(Nelements*Nggeo); + wJ.malloc(Nelements); + #pragma omp parallel for for(dlong e=0;eExchange(vgeo, 
Nvgeo, ogs_dfloat); + halo.Exchange(vgeo, Nvgeo); + + o_wJ = platform.malloc(wJ); + o_vgeo = platform.malloc(vgeo); + o_ggeo = platform.malloc(ggeo); } + +} //namespace libp diff --git a/libs/mesh/meshGeometricFactorsTri3D.cpp b/libs/mesh/meshGeometricFactorsTri3D.cpp index bcac3c639..f9558f829 100644 --- a/libs/mesh/meshGeometricFactorsTri3D.cpp +++ b/libs/mesh/meshGeometricFactorsTri3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,22 +25,69 @@ SOFTWARE. */ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" // custom geometric factors specialized for 3D tri on sphere -void meshTri3D::GeometricFactors(){ +namespace libp { - /* unified storage array for geometric factors */ - Nvgeo = 12; // +void mesh_t::GeometricFactorsTri3D(){ + + /*Set offsets*/ + Nvgeo = 10; + + RXID = 0; + RYID = 1; + RZID = 2; + SXID = 3; + SYID = 4; + SZID = 5; + TXID = 6; + TYID = 7; + TZID = 8; + JID = 9; + + props["defines/" "p_Nvgeo"]= Nvgeo; + props["defines/" "p_RXID"]= RXID; + props["defines/" "p_SXID"]= SXID; + props["defines/" "p_TXID"]= TXID; + props["defines/" "p_RYID"]= RYID; + props["defines/" "p_SYID"]= SYID; + props["defines/" "p_TYID"]= TYID; + + props["defines/" "p_RZID"]= RZID; + props["defines/" "p_SZID"]= SZID; + props["defines/" "p_TZID"]= TZID; + + props["defines/" "p_JID"]= JID; + + /* unified storage array for geometric factors */ /* note that we have volume geometric factors for each node */ - vgeo = (dfloat*) calloc((Nelements+totalHaloPairs)*Nvgeo*Np, sizeof(dfloat)); + vgeo.malloc((Nelements+totalHaloPairs)*Nvgeo*Np); + + Nggeo = 6; + + G00ID=0; + G01ID=1; + G02ID=2; + G11ID=3; + G12ID=4; + G22ID=5; + + props["defines/" "p_Nggeo"]= Nggeo; + props["defines/" 
"p_G00ID"]= G00ID; + props["defines/" "p_G01ID"]= G01ID; + props["defines/" "p_G02ID"]= G02ID; + props["defines/" "p_G11ID"]= G11ID; + props["defines/" "p_G12ID"]= G12ID; + props["defines/" "p_G22ID"]= G22ID; /* number of second order geometric factors */ - Nggeo = 7; - ggeo = (dfloat*) calloc(Nelements*Nggeo, sizeof(dfloat)); + ggeo.malloc(Nelements*Nggeo*Np); + + wJ.malloc(Nelements*Np); + #pragma omp parallel for for(int e=0;eExchange(vgeo, Nvgeo*Np, ogs_dfloat); + halo.Exchange(vgeo, Nvgeo*Np); + + o_wJ = platform.malloc(wJ); + o_vgeo = platform.malloc(vgeo); + o_ggeo = platform.malloc(ggeo); } + +} //namespace libp diff --git a/libs/mesh/meshGeometricPartition2D.cpp b/libs/mesh/meshGeometricPartition2D.cpp deleted file mode 100644 index 80770a7f3..000000000 --- a/libs/mesh/meshGeometricPartition2D.cpp +++ /dev/null @@ -1,396 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- -*/ - -#include "mesh.hpp" -#include "mesh/mesh2D.hpp" - -#define bitRange 15 - -#if 0 - -/// THIS SECTION ------------------------------------------------------------------------------------> -// taken from: http://and-what-happened.blogspot.com/2011/08/fast-2d-and-3d-hilbert-curves-and.html - -unsigned int Morton_2D_Encode_16bit( unsigned int index1, unsigned int index2 ) -{ // pack 2 16-bit indices into a 32-bit Morton code - index1 &= 0x0000ffff; - index2 &= 0x0000ffff; - index1 |= ( index1 << 8 ); - index2 |= ( index2 << 8 ); - index1 &= 0x00ff00ff; - index2 &= 0x00ff00ff; - index1 |= ( index1 << 4 ); - index2 |= ( index2 << 4 ); - index1 &= 0x0f0f0f0f; - index2 &= 0x0f0f0f0f; - index1 |= ( index1 << 2 ); - index2 |= ( index2 << 2 ); - index1 &= 0x33333333; - index2 &= 0x33333333; - index1 |= ( index1 << 1 ); - index2 |= ( index2 << 1 ); - index1 &= 0x55555555; - index2 &= 0x55555555; - return( index1 | ( index2 << 1 ) ); -} - -unsigned int MortonToHilbert2D( const unsigned int morton, const unsigned int bits ) -{ - unsigned int hilbert = 0; - unsigned int remap = 0xb4; - unsigned int block = ( bits << 1 ); - while( block ) - { - block -= 2; - unsigned int mcode = ( ( morton >> block ) & 3 ); - unsigned int hcode = ( ( remap >> ( mcode << 1 ) ) & 3 ); - remap ^= ( 0x82000028 >> ( hcode << 3 ) ); - hilbert = ( ( hilbert << 2 ) + hcode ); - } - return( hilbert ); -} - - -unsigned int hilbert2D(unsigned int index1, unsigned int index2){ - - unsigned int morton = Morton_2D_Encode_16bit(index1,index2); - - return MortonToHilbert2D(morton, 16); -} - -/// THIS SECTION TO HERE <-------------------------------------------------------------------------------- - -// spread bits of i by introducing zeros between binary bits -unsigned long long int bitSplitter(unsigned int i){ - - unsigned long long int mask = 1; - unsigned long long int li = i; - unsigned long long int lj = 0; - - for(int b=0;b0; s/=2) { - rx = (x & s) > 0; - ry = (y & s) > 0; - d += s * s * ((3 * rx) ^ 
ry); - rot(s, &x, &y, rx, ry); - } - return d; -} - -#endif - -// capsule for element vertices + Morton index -typedef struct { - - unsigned long long int index; - - dlong element; - - int type; - - // 4 for maximum number of vertices per element in 2D - hlong v[4]; - - dfloat EX[4], EY[4]; - -}element_t; - -// compare the Morton indices for two element capsules -static int compareElements2D(const void *a, const void *b){ - - element_t *ea = (element_t*) a; - element_t *eb = (element_t*) b; - - if(ea->index < eb->index) return -1; - if(ea->index > eb->index) return 1; - - return 0; - -} - -// stub for the match function needed by parallelSort -static void bogusMatch(void *a, void *b){ } - -// geometric partition of elements in 2D mesh using Morton ordering + parallelSort -void mesh2D::GeometricPartition(){ - - dlong maxNelements; - MPI_Allreduce(&(Nelements), &maxNelements, 1, MPI_DLONG, MPI_MAX, comm); - maxNelements = 2*((maxNelements+1)/2); - - // fix maxNelements - element_t *elements - = (element_t*) calloc(maxNelements, sizeof(element_t)); - - // local bounding box of element centers - dfloat mincx = 1e9, maxcx = -1e9; - dfloat mincy = 1e9, maxcy = -1e9; - - // compute element centers on this process - for(dlong e=0;eindex < eb->index) return -1; - if(ea->index > eb->index) return 1; - - return 0; - -} - -// stub for the match function needed by parallelSort -static void bogusMatch3D(void *a, void *b){ } - -// geometric partition of elements in 3D mesh using Morton ordering + parallelSort -void mesh3D::GeometricPartition(){ - - dlong maxNelements; - MPI_Allreduce(&(Nelements), &maxNelements, 1, MPI_DLONG, MPI_MAX, - comm); - maxNelements = 2*((maxNelements+1)/2); - - // fix maxNelements - element_t *elements - = (element_t*) calloc(maxNelements, sizeof(element_t)); - - // local bounding box of element centers - dfloat minvx = 1e9, maxvx = -1e9; - dfloat minvy = 1e9, maxvy = -1e9; - dfloat minvz = 1e9, maxvz = -1e9; - - // compute element centers on this 
process - for(dlong n=0;n +using __gnu_parallel::sort; +#else +using std::sort; +#endif + +namespace libp { + typedef struct{ hlong gid; @@ -39,75 +48,96 @@ typedef struct{ // exchange of trace nodes void mesh_t::HaloRingSetup(){ - //make a global indexing of element Ids - hlong *globalOffsets = (hlong *) calloc(size+1,sizeof(hlong)); - hlong localNelements = (hlong) Nelements; + memory globalOffset(size+1, 0); //gather number of elements on each rank - MPI_Allgather(&localNelements, 1, MPI_HLONG, globalOffsets+1, 1, MPI_HLONG, comm); + hlong localNelements = Nelements; + comm.Allgather(localNelements, globalOffset+1); for(int rr=0;rr minRank(Ntotal); + memory maxRank(Ntotal); - //use the gs to find what nodes are local to this rank - dlong Ntotal = Np*Nelements; - int *minRank = (int *) calloc(Ntotal,sizeof(int)); - int *maxRank = (int *) calloc(Ntotal,sizeof(int)); for (dlong i=0;iGatherScatter(minRank, ogs_int, ogs_min, ogs_sym); //minRank[n] contains the smallest rank taking part in the gather of node n - ogs->GatherScatter(maxRank, ogs_int, ogs_max, ogs_sym); //maxRank[n] contains the largest rank taking part in the gather of node n + hlong gatherChange = 1; + + // keep comparing numbers on positive and negative traces until convergence + while(gatherChange>0){ + + // reset change counter + gatherChange = 0; + + // send halo data and recv into extension of buffer + halo.Exchange(minRank, Nverts); + halo.Exchange(maxRank, Nverts); + + // compare trace vertices + #pragma omp parallel for collapse(2) + for(dlong e=0;emaxRankM){ + gatherChange=1; + maxRank[idM] = maxRankP; + } + } + } + + // sum up changes + comm.Allreduce(gatherChange); + } //Make a list of the elements participating in the ring exchange //Count the number of shared vertices in the local mesh dlong NsendVerts=0; - for (int e=0;e vertexSendList(NsendVerts); + + memory vertexSendCounts(size, 0); + memory vertexRecvCounts(size); + memory vertexSendOffsets(size+1); + memory vertexRecvOffsets(size+1); 
NsendVerts=0; - for (int e=0;e vertexRecvList(NrecvVerts); // exchange shared vertices - MPI_Alltoallv(vertexSendList, vertexSendCounts, vertexSendOffsets, MPI_VERTEX_T, - vertexRecvList, vertexRecvCounts, vertexRecvOffsets, MPI_VERTEX_T, - comm); + comm.Alltoallv(vertexSendList, vertexSendCounts, vertexSendOffsets, + vertexRecvList, vertexRecvCounts, vertexRecvOffsets); // sort based on globalId to find matches - std::sort(vertexRecvList, vertexRecvList+NrecvVerts, + sort(vertexRecvList.ptr(), vertexRecvList.ptr()+NrecvVerts, [](const vertex_t& a, const vertex_t& b) {return a.gid < b.gid;}); @@ -157,8 +184,9 @@ void mesh_t::HaloRingSetup(){ } //Build offsets to unique vertice starts - dlong *vertexOffsets = (dlong*) calloc(Nunique+1, sizeof(dlong)); + memory vertexOffsets(Nunique+1); + vertexOffsets[0] = 0; Nunique=(NrecvVerts) ? 1:0; for(dlong n=1;n b.rank) return false; @@ -255,11 +277,11 @@ void mesh_t::HaloRingSetup(){ } //make a list of global element ids taking part in the halo exchange - hlong *globalElementId = (hlong *) malloc((Nelements+totalRingElements)*sizeof(hlong)); + memory globalElementId(Nelements+totalRingElements); //outgoing elements for(int e=0;e globalOffset(size+1, 0); //gather number of elements on each rank - MPI_Allgather(&localNelements, 1, MPI_HLONG, globalOffset+1, 1, MPI_HLONG, comm); + hlong localNelements = Nelements; + comm.Allgather(localNelements, globalOffset+1); for(int rr=0;rr(internalElementIds); + o_haloElementIds = platform.malloc(haloElementIds); + //make a list of global element ids taking part in the halo exchange - hlong *globalElementId = (hlong *) malloc((Nelements+totalHaloPairs)*sizeof(hlong)); + memory globalElementId(Nelements+totalHaloPairs); //outgoing elements for(int e=0;eExchange(EX, Nverts, ogs_dfloat); - halo->Exchange(EY, Nverts, ogs_dfloat); - if(dim==3) - halo->Exchange(EZ, Nverts, ogs_dfloat); + bool verbose = false; + halo.Setup(Nelements+totalHaloPairs, + globalElementId, comm, + ogs::Pairwise, 
verbose, platform); + } + +} //namespace libp + diff --git a/libs/mesh/meshHaloTraceSetup.cpp b/libs/mesh/meshHaloTraceSetup.cpp index 354251038..7045c4620 100644 --- a/libs/mesh/meshHaloTraceSetup.cpp +++ b/libs/mesh/meshHaloTraceSetup.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,29 +26,23 @@ SOFTWARE. #include "mesh.hpp" +namespace libp { + /* Set up trace halo infomation for inter-processor MPI exchange of trace nodes */ // Setup assumes field to be exchanged is Nelements*Nfields*Np in size // with Np being the fastest running index (hence each field entry is strided // Np apart) -halo_t* mesh_t::HaloTraceSetup(int Nfields){ - - hlong *globalOffsets = (hlong *) calloc(size+1,sizeof(hlong)); - hlong localNelements = (hlong) Nelements; - - //gather number of elements on each rank - MPI_Allgather(&localNelements, 1, MPI_HLONG, globalOffsets+1, 1, MPI_HLONG, comm); +ogs::halo_t mesh_t::HaloTraceSetup(int Nfields){ - for(int rr=0;rr globalids((Nelements+totalHaloPairs)*Np*Nfields); for (dlong e=0;eExchange(globalids, Np*Nfields, ogs_hlong); + halo.Exchange(globalids, Np*Nfields); //flag the trace ids we need for (dlong e=0;e& o_q, deviceMemory& o_Mq) { //compute Mq = M*q - MassMatrixKernel(Nelements, o_ggeo, o_MM, o_q, o_Mq); + MassMatrixKernel(Nelements, o_wJ, o_MM, o_q, o_Mq); } -void meshTri2D::MassMatrixKernelSetup(int Nfields) { - occa::properties kernelInfo = props; //copy base occa properties +void mesh_t::MassMatrixKernelSetupTri2D(int Nfields) { + properties_t kernelInfo = props; //copy base occa properties kernelInfo["defines/" "p_Nfields"]= Nfields; MassMatrixKernel = platform.buildKernel(MESH_DIR "/okl/MassMatrixOperatorTri2D.okl", @@ -42,8 +42,8 @@ 
void meshTri2D::MassMatrixKernelSetup(int Nfields) { kernelInfo); } -void meshQuad2D::MassMatrixKernelSetup(int Nfields) { - occa::properties kernelInfo = props; //copy base occa properties +void mesh_t::MassMatrixKernelSetupQuad2D(int Nfields) { + properties_t kernelInfo = props; //copy base occa properties kernelInfo["defines/" "p_Nfields"]= Nfields; MassMatrixKernel = platform.buildKernel(MESH_DIR "/okl/MassMatrixOperatorQuad2D.okl", @@ -51,8 +51,8 @@ void meshQuad2D::MassMatrixKernelSetup(int Nfields) { kernelInfo); } -void meshTet3D::MassMatrixKernelSetup(int Nfields) { - occa::properties kernelInfo = props; //copy base occa properties +void mesh_t::MassMatrixKernelSetupTet3D(int Nfields) { + properties_t kernelInfo = props; //copy base occa properties kernelInfo["defines/" "p_Nfields"]= Nfields; MassMatrixKernel = platform.buildKernel(MESH_DIR "/okl/MassMatrixOperatorTet3D.okl", @@ -60,8 +60,8 @@ void meshTet3D::MassMatrixKernelSetup(int Nfields) { kernelInfo); } -void meshHex3D::MassMatrixKernelSetup(int Nfields) { - occa::properties kernelInfo = props; //copy base occa properties +void mesh_t::MassMatrixKernelSetupHex3D(int Nfields) { + properties_t kernelInfo = props; //copy base occa properties kernelInfo["defines/" "p_Nfields"]= Nfields; MassMatrixKernel = platform.buildKernel(MESH_DIR "/okl/MassMatrixOperatorHex3D.okl", @@ -69,15 +69,4 @@ void meshHex3D::MassMatrixKernelSetup(int Nfields) { kernelInfo); } -void meshTri3D::MassMatrixKernelSetup(int Nfields) { - LIBP_ABORT("MassMatrixOperatorTri3D not implemented yet.") -} - -void meshQuad3D::MassMatrixKernelSetup(int Nfields) { - occa::properties kernelInfo = props; //copy base occa properties - kernelInfo["defines/" "p_Nfields"]= Nfields; - - MassMatrixKernel = platform.buildKernel(MESH_DIR "/okl/MassMatrixOperatorQuad2D.okl", - "MassMatrixOperatorQuad2D", - kernelInfo); -} \ No newline at end of file +} //namespace libp diff --git a/libs/mesh/meshMinCharacteristicLength.cpp 
b/libs/mesh/meshMinCharacteristicLength.cpp index 2217f3a7a..4d54cf79b 100644 --- a/libs/mesh/meshMinCharacteristicLength.cpp +++ b/libs/mesh/meshMinCharacteristicLength.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,8 +25,8 @@ SOFTWARE. */ #include "mesh.hpp" -#include "mesh/mesh2D.hpp" -#include "mesh/mesh3D.hpp" + +namespace libp { dfloat mesh_t::MinCharacteristicLength(){ @@ -34,17 +34,15 @@ dfloat mesh_t::MinCharacteristicLength(){ for(dlong e=0;e::max(); for(int f=0;f::max(); @@ -80,12 +78,12 @@ dfloat meshQuad2D::ElementCharacteristicLength(dlong e) { // h = 1/(sJ/J) dfloat hest = J/sJ; - h = mymin(h, hest); + h = std::min(h, hest); } return h; } -dfloat meshTri3D::ElementCharacteristicLength(dlong e) { +dfloat mesh_t::ElementCharacteristicLengthTet3D(dlong e) { dfloat h = std::numeric_limits::max(); for(int f=0;f::max(); @@ -121,48 +119,9 @@ dfloat meshQuad3D::ElementCharacteristicLength(dlong e) { // h = 1/(sJ/J) dfloat hest = J/sJ; - h = mymin(h, hest); - } - return h; -} - -dfloat meshTet3D::ElementCharacteristicLength(dlong e) { - - dfloat h = std::numeric_limits::max(); - for(int f=0;f::max(); - - //sum weighted Jacobians to integrate over the element - dfloat J = 0.0; - for (int n=0;n mesh_t::MultiRateHaloTraceSetup(int Nfields){ - for(int rr=0;rr globalids((Nelements+totalHaloPairs)*Np*Nfields); for (dlong e=0;e traceIds((Nelements+totalHaloPairs) + *Nfp*Nfaces*Nfields); for (dlong e=0;eExchange(traceIds, Nfp*Nfaces*Nfields, ogs_hlong); + halo.Exchange(traceIds, Nfp*Nfaces*Nfields); //the halo region is filled, but there are duplicate IDs in the local section // bad news for the halo exchange, so remove them @@ -104,12 +97,17 @@ halo_t** 
mesh_t::MultiRateHaloTraceSetup(int Nfields){ } //make array of halo exchangers - halo_t** mrTraceHalo = (halo_t **) malloc(mrNlevels*sizeof(halo_t*)); + memory mrTraceHalo(mrNlevels); //make a global trace id array to be used for exchange on each multirate level - hlong *mrTraceIds = (hlong *) calloc((Nelements+totalHaloPairs) - *Nfp*Nfaces*Nfields,sizeof(hlong)); - memcpy(mrTraceIds, traceIds, Nelements*Nfp*Nfaces*Nfields*sizeof(hlong)); //copy local part + memory mrTraceIds((Nelements+totalHaloPairs) + *Nfp*Nfaces*Nfields); + mrTraceIds.copyFrom(traceIds, Nelements*Nfp*Nfaces*Nfields); //copy local part + + /*Zero halo region*/ + for (dlong n=0;n EToDT) { const int maxLevels = 100; //find global min and max dt - dfloat dtmin=1.e9, dtmax=0.0; - if (Nelements) { - dtmin = EToDT[0]; - dtmax = EToDT[0]; - } - for (dlong e=1;e::max(); + dfloat dtmax = std::numeric_limits::min(); + for (dlong e=0;e(std::floor(std::log2(dtmax/dtmin)))+1, + maxLevels); //compute the level of each element - mrLevel = (int *) calloc(Nelements+totalHaloPairs,sizeof(int)); + mrLevel.malloc(Nelements+totalHaloPairs); for(int lev=0; lev=dtlev) + if(EToDT[e] >=dtlev) { mrLevel[e] = lev; + } } } //enforce one level difference between neighbours for (int lev=0; lev < mrNlevels; lev++){ - halo->Exchange(mrLevel, 1, ogs_int); + halo.Exchange(mrLevel, 1); for (dlong e=0; e lev+1) { //find elements at least 2 levels higher than lev @@ -82,21 +82,20 @@ void mesh_t::MultiRateSetup(dfloat *EToDT) { //this could change the number of levels there are, so find the new max level mrNlevels = 0; for (dlong e=0;emrNlevels) ? 
mrLevel[e] : mrNlevels; + mrNlevels = std::max(mrLevel[e],mrNlevels); mrNlevels++; - int localNlevels = mrNlevels; - MPI_Allreduce(&localNlevels, &mrNlevels, 1, MPI_INT, MPI_MAX, comm); + comm.Allreduce(mrNlevels, Comm::Max); //construct element and halo lists // mrElements[lev] - list of all elements with multirate level <= lev // mrInterfaceElements[lev] - list of all elements with multirate level = lev, // with a neighbor of level lev-1 - mrNelements = (dlong *) calloc(mrNlevels,sizeof(dlong)); - mrInterfaceNelements = (dlong *) calloc(mrNlevels,sizeof(dlong)); + mrNelements.malloc(mrNlevels, 0); + mrInterfaceNelements.malloc(mrNlevels, 0); - mrElements = (dlong **) calloc(mrNlevels,sizeof(dlong*)); - mrInterfaceElements = (dlong **) calloc(mrNlevels,sizeof(dlong*)); + mrElements.malloc(mrNlevels); + mrInterfaceElements.malloc(mrNlevels); for (dlong e=0;e cnt(mrNlevels, 0); + memory cnt2(mrNlevels, 0); //fill element lists for (dlong e=0;e(Nelements, mrLevel); + o_mrNelements = platform.malloc(mrNlevels, mrNelements); + o_mrInterfaceNelements = platform.malloc(mrNlevels, mrInterfaceNelements); - o_mrElements = new occa::memory[mrNlevels]; - o_mrInterfaceElements = new occa::memory[mrNlevels]; + o_mrElements.malloc(mrNlevels); + o_mrInterfaceElements.malloc(mrNlevels); for (int lev =0;lev(mrNelements[lev], mrElements[lev]); if (mrInterfaceNelements[lev]) - o_mrInterfaceElements[lev] = platform.malloc(mrInterfaceNelements[lev]*sizeof(dlong), mrInterfaceElements[lev]); + o_mrInterfaceElements[lev] = platform.malloc(mrInterfaceNelements[lev], mrInterfaceElements[lev]); } if (rank==0){ @@ -163,15 +161,13 @@ void mesh_t::MultiRateSetup(dfloat *EToDT) { hlong Ntotal=0; for (int lev=0; levmesh_t::OccaSetup(); - - o_x = platform.malloc(Nelements*Np*sizeof(dfloat), x); - o_y = platform.malloc(Nelements*Np*sizeof(dfloat), y); - o_z = o_y; // dummy z variable - - props["defines/" "p_NXID"]= NXID; - props["defines/" "p_NYID"]= NYID; - props["defines/" "p_SJID"]= SJID; - 
props["defines/" "p_IJID"]= IJID; - props["defines/" "p_IHID"]= IHID; - props["defines/" "p_WIJID"]= WIJID; - props["defines/" "p_WSJID"]= WSJID; - - props["defines/" "p_G00ID"]= G00ID; - props["defines/" "p_G01ID"]= G01ID; - props["defines/" "p_G11ID"]= G11ID; - props["defines/" "p_GWJID"]= GWJID; - - props["defines/" "p_RXID"]= RXID; - props["defines/" "p_SXID"]= SXID; - props["defines/" "p_RYID"]= RYID; - props["defines/" "p_SYID"]= SYID; - - props["defines/" "p_JID"]= JID; - props["defines/" "p_JWID"]= JWID; - props["defines/" "p_IJWID"]= IJWID; - -} diff --git a/libs/mesh/meshOccaSetup3D.cpp b/libs/mesh/meshOccaSetup3D.cpp deleted file mode 100644 index b0586c46c..000000000 --- a/libs/mesh/meshOccaSetup3D.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- -*/ - -#include "mesh.hpp" -#include "mesh/mesh3D.hpp" - -void mesh3D::OccaSetup(){ - - this->mesh_t::OccaSetup(); - - o_x = platform.malloc(Nelements*Np*sizeof(dfloat), x); - o_y = platform.malloc(Nelements*Np*sizeof(dfloat), y); - o_z = platform.malloc(Nelements*Np*sizeof(dfloat), z); - - props["defines/" "p_NXID"]= NXID; - props["defines/" "p_NYID"]= NYID; - props["defines/" "p_NZID"]= NZID; - props["defines/" "p_SJID"]= SJID; - props["defines/" "p_IJID"]= IJID; - props["defines/" "p_IHID"]= IHID; - props["defines/" "p_WSJID"]= WSJID; - props["defines/" "p_WIJID"]= WIJID; - props["defines/" "p_STXID"]= STXID; - props["defines/" "p_STYID"]= STYID; - props["defines/" "p_STZID"]= STZID; - props["defines/" "p_SBXID"]= SBXID; - props["defines/" "p_SBYID"]= SBYID; - props["defines/" "p_SBZID"]= SBZID; - - props["defines/" "p_G00ID"]= G00ID; - props["defines/" "p_G01ID"]= G01ID; - props["defines/" "p_G02ID"]= G02ID; - props["defines/" "p_G11ID"]= G11ID; - props["defines/" "p_G12ID"]= G12ID; - props["defines/" "p_G22ID"]= G22ID; - props["defines/" "p_GWJID"]= GWJID; - - - props["defines/" "p_RXID"]= RXID; - props["defines/" "p_SXID"]= SXID; - props["defines/" "p_TXID"]= TXID; - - props["defines/" "p_RYID"]= RYID; - props["defines/" "p_SYID"]= SYID; - props["defines/" "p_TYID"]= TYID; - - props["defines/" "p_RZID"]= RZID; - props["defines/" "p_SZID"]= SZID; - props["defines/" "p_TZID"]= TZID; - - props["defines/" "p_JID"]= JID; - props["defines/" "p_JWID"]= JWID; - props["defines/" "p_IJWID"]= IJWID; -} diff --git a/libs/mesh/meshOccaSetupHex3D.cpp b/libs/mesh/meshOccaSetupHex3D.cpp deleted file mode 100644 index acfb681b8..000000000 --- a/libs/mesh/meshOccaSetupHex3D.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the 
Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -#include "mesh.hpp" -#include "mesh/mesh3D.hpp" - -void meshHex3D::OccaSetup(){ - - this->mesh3D::OccaSetup(); - - o_D = platform.malloc(Nq*Nq*sizeof(dfloat), D); - - o_S = o_D; //dummy - o_MM = o_D; //dummy - o_sM = o_D; //dummy - o_LIFT = o_D; //dummy - - o_vgeo = platform.malloc((Nelements+totalHaloPairs)*Nvgeo*Np*sizeof(dfloat), vgeo); - o_sgeo = platform.malloc(Nelements*Nfaces*Nfp*Nsgeo*sizeof(dfloat), sgeo); - o_ggeo = platform.malloc(Nelements*Np*Nggeo*sizeof(dfloat), ggeo); - - /* NC: disabling until we re-add treatment of affine elements - - // build trilinear geometric factors for hexes - if(settings.compareSetting("ELEMENT MAP", "AFFINE")){ - // pack gllz, gllw, and elementwise EXYZ - hlong Nxyz = Nelements*dim*Nverts; - EXYZ = (dfloat*) calloc(Nxyz, sizeof(dfloat)); - gllzw = (dfloat*) calloc(2*Nq, sizeof(dfloat)); - - int sk = 0; - for(int n=0;nmesh3D::OccaSetup(); - - // build transposes (we hold matrices as column major on device) - dfloat *DT = (dfloat*) calloc(3*Np*Np, sizeof(dfloat)); - dfloat *DrT = DT + 0*Np*Np; - dfloat *DsT = DT + 1*Np*Np; - dfloat *DtT = DT + 2*Np*Np; - 
matrixTranspose(Np, Np, Dr, Np, DrT, Np); - matrixTranspose(Np, Np, Ds, Np, DsT, Np); - matrixTranspose(Np, Np, Dt, Np, DtT, Np); - - dfloat *LIFTT = (dfloat*) calloc(Np*Nfaces*Nfp, sizeof(dfloat)); - matrixTranspose(Np, Nfp*Nfaces, LIFT, Nfp*Nfaces, LIFTT, Np); - - dfloat *sMT = (dfloat *) calloc(Np*Nfaces*Nfp,sizeof(dfloat)); - matrixTranspose(Np, Nfp*Nfaces, sM, Nfp*Nfaces, sMT, Np); - - dfloat *ST = (dfloat*) calloc(6*Np*Np, sizeof(dfloat)); - dfloat *SrrT = ST + 0*Np*Np; - dfloat *SrsT = ST + 1*Np*Np; - dfloat *SrtT = ST + 2*Np*Np; - dfloat *SssT = ST + 3*Np*Np; - dfloat *SstT = ST + 4*Np*Np; - dfloat *SttT = ST + 5*Np*Np; - matrixTranspose(Np, Np, Srr, Np, SrrT, Np); - matrixTranspose(Np, Np, Srs, Np, SrsT, Np); - matrixTranspose(Np, Np, Srt, Np, SrtT, Np); - matrixTranspose(Np, Np, Sss, Np, SssT, Np); - matrixTranspose(Np, Np, Sst, Np, SstT, Np); - matrixTranspose(Np, Np, Stt, Np, SttT, Np); - - o_D = platform.malloc(3*Np*Np*sizeof(dfloat), DT); - o_MM = platform.malloc(Np*Np*sizeof(dfloat), MM); //MM is symmetric - - o_sM = platform.malloc(Np*Nfaces*Nfp*sizeof(dfloat), sMT); - - o_LIFT = platform.malloc(Np*Nfaces*Nfp*sizeof(dfloat), LIFTT); - - o_S = platform.malloc(6*Np*Np*sizeof(dfloat), ST); - - o_vgeo = platform.malloc((Nelements+totalHaloPairs)*Nvgeo*sizeof(dfloat), vgeo); - o_sgeo = platform.malloc(Nelements*Nfaces*Nsgeo*sizeof(dfloat), sgeo); - o_ggeo = platform.malloc(Nelements*Nggeo*sizeof(dfloat), ggeo); - - free(DT); - free(LIFTT); - free(sMT); - free(ST); -} diff --git a/libs/mesh/meshOccaSetupTri2D.cpp b/libs/mesh/meshOccaSetupTri2D.cpp deleted file mode 100644 index 249d1e8c2..000000000 --- a/libs/mesh/meshOccaSetupTri2D.cpp +++ /dev/null @@ -1,72 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without 
restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -#include "mesh.hpp" -#include "mesh/mesh2D.hpp" - -void meshTri2D::OccaSetup(){ - - this->mesh2D::OccaSetup(); - - // build transposes (we hold matrices as column major on device) - dfloat *DT = (dfloat*) calloc(2*Np*Np, sizeof(dfloat)); - dfloat *DrT = DT + 0*Np*Np; - dfloat *DsT = DT + 1*Np*Np; - matrixTranspose(Np, Np, Dr, Np, DrT, Np); - matrixTranspose(Np, Np, Ds, Np, DsT, Np); - - dfloat *LIFTT = (dfloat*) calloc(Np*Nfaces*Nfp, sizeof(dfloat)); - matrixTranspose(Np, Nfp*Nfaces, LIFT, Nfp*Nfaces, LIFTT, Np); - - dfloat *sMT = (dfloat *) calloc(Np*Nfaces*Nfp,sizeof(dfloat)); - matrixTranspose(Np, Nfp*Nfaces, sM, Nfp*Nfaces, sMT, Np); - - dfloat *ST = (dfloat*) calloc(3*Np*Np, sizeof(dfloat)); - dfloat *SrrT = ST + 0*Np*Np; - dfloat *SrsT = ST + 1*Np*Np; - dfloat *SssT = ST + 2*Np*Np; - matrixTranspose(Np, Np, Srr, Np, SrrT, Np); - matrixTranspose(Np, Np, Srs, Np, SrsT, Np); - matrixTranspose(Np, Np, Sss, Np, SssT, Np); - - o_D = platform.malloc(2*Np*Np*sizeof(dfloat), DT); - o_MM = platform.malloc(Np*Np*sizeof(dfloat), MM); //MM is symmetric - - o_sM = platform.malloc(Np*Nfaces*Nfp*sizeof(dfloat), sMT); - - 
o_LIFT = platform.malloc(Np*Nfaces*Nfp*sizeof(dfloat), LIFTT); - - o_S = platform.malloc(3*Np*Np*sizeof(dfloat), ST); - - o_vgeo = platform.malloc((Nelements+totalHaloPairs)*Nvgeo*sizeof(dfloat), vgeo); - o_sgeo = platform.malloc(Nelements*Nfaces*Nsgeo*sizeof(dfloat), sgeo); - o_ggeo = platform.malloc(Nelements*Nggeo*sizeof(dfloat), ggeo); - - free(DT); - free(LIFTT); - free(sMT); - free(ST); -} diff --git a/libs/mesh/meshOccaSetupTri3D.cpp b/libs/mesh/meshOccaSetupTri3D.cpp deleted file mode 100644 index 4985b3089..000000000 --- a/libs/mesh/meshOccaSetupTri3D.cpp +++ /dev/null @@ -1,72 +0,0 @@ -/* - - The MIT License (MIT) - - Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. 
- -*/ - -#include "mesh.hpp" -#include "mesh/mesh3D.hpp" - -void meshTri3D::OccaSetup(){ - - this->mesh3D::OccaSetup(); - - // build transposes (we hold matrices as column major on device) - dfloat *DT = (dfloat*) calloc(2*Np*Np, sizeof(dfloat)); - dfloat *DrT = DT + 0*Np*Np; - dfloat *DsT = DT + 1*Np*Np; - matrixTranspose(Np, Np, Dr, Np, DrT, Np); - matrixTranspose(Np, Np, Ds, Np, DsT, Np); - - dfloat *LIFTT = (dfloat*) calloc(Np*Nfaces*Nfp, sizeof(dfloat)); - matrixTranspose(Np, Nfp*Nfaces, LIFT, Nfp*Nfaces, LIFTT, Np); - - dfloat *sMT = (dfloat *) calloc(Np*Nfaces*Nfp,sizeof(dfloat)); - matrixTranspose(Np, Nfp*Nfaces, sM, Nfp*Nfaces, sMT, Np); - - dfloat *ST = (dfloat*) calloc(3*Np*Np, sizeof(dfloat)); - dfloat *SrrT = ST + 0*Np*Np; - dfloat *SrsT = ST + 1*Np*Np; - dfloat *SssT = ST + 2*Np*Np; - matrixTranspose(Np, Np, Srr, Np, SrrT, Np); - matrixTranspose(Np, Np, Srs, Np, SrsT, Np); - matrixTranspose(Np, Np, Sss, Np, SssT, Np); - - o_D = platform.malloc(2*Np*Np*sizeof(dfloat), DT); - o_MM = platform.malloc(Np*Np*sizeof(dfloat), MM); //MM is symmetric - - o_sM = platform.malloc(Np*Nfaces*Nfp*sizeof(dfloat), sMT); - - o_LIFT = platform.malloc(Np*Nfaces*Nfp*sizeof(dfloat), LIFTT); - - o_S = platform.malloc(3*Np*Np*sizeof(dfloat), ST); - - o_vgeo = platform.malloc((Nelements+totalHaloPairs)*Nvgeo*sizeof(dfloat), vgeo); - o_sgeo = platform.malloc(Nelements*Nfaces*Nsgeo*sizeof(dfloat), sgeo); - o_ggeo = platform.malloc(Nelements*Nggeo*sizeof(dfloat), ggeo); - - free(DT); - free(LIFTT); - free(sMT); - free(ST); -} diff --git a/libs/mesh/meshParallelConnectNodes.cpp b/libs/mesh/meshParallelConnectNodes.cpp deleted file mode 100644 index 5343e4a2c..000000000 --- a/libs/mesh/meshParallelConnectNodes.cpp +++ /dev/null @@ -1,110 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files 
(the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -#include "mesh.hpp" - - -// uniquely label each node with a global index, used for gatherScatter -void mesh_t::ParallelConnectNodes(){ - - hlong localNodeCount = Np*Nelements; - hlong *allLocalNodeCounts = (hlong*) calloc(size, sizeof(hlong)); - - MPI_Allgather(&localNodeCount, 1, MPI_HLONG, - allLocalNodeCounts, 1, MPI_HLONG, - comm); - - hlong gatherNodeStart = 0; - for(int rr=0;rrvirtual gather) - int *baseRank = (int *) malloc((totalHaloPairs+Nelements)*Np*sizeof(int)); - globalIds = (hlong *) malloc((totalHaloPairs+Nelements)*Np*sizeof(hlong)); - - // use local numbering - for(dlong e=0;e0){ - - // reset change counter - localChange = 0; - - // send halo data and recv into extension of buffer - halo->Exchange(baseRank, Np, ogs_int); - halo->Exchange(globalIds, Np, ogs_hlong); - - // compare trace nodes - for(dlong e=0;eConnect(); - - // count # of elements to send to each rank based on - // minimum {vertex id % size} - int *Nsend = (int*) calloc(size, sizeof(int)); - int *Nrecv = (int*) calloc(size, sizeof(int)); - int *sendOffsets = (int*) calloc(size, 
sizeof(int)); - int *recvOffsets = (int*) calloc(size, sizeof(int)); - - // WARNING: In some corner cases, the number of faces to send may overrun int storage - int allNsend = 0; - for(dlong e=0;e()); - - sendFaces[id].rank = rank; - - sendFaces[id].elementN = -1; - sendFaces[id].faceN = -1; - sendFaces[id].rankN = -1; - - ++Nsend[destRank]; - } - } - } - - // exchange byte counts - MPI_Alltoall(Nsend, 1, MPI_INT, - Nrecv, 1, MPI_INT, - comm); - - // count incoming faces - int allNrecv = 0; - for(int rr=0;rr b.rank) return false; - - if(a.element < b.element) return true; - if(a.element > b.element) return false; - - return (a.face < b.face); - }); - - // send faces back from whence they came - MPI_Alltoallv(recvFaces, Nrecv, recvOffsets, MPI_PARALLELFACE_T, - sendFaces, Nsend, sendOffsets, MPI_PARALLELFACE_T, - comm); - - // extract connectivity info - EToP = (int*) calloc(Nelements*Nfaces, sizeof(int)); - for(dlong cnt=0;cnt=0 && f>=0 && eN>=0 && fN>=0){ - EToE[e*Nfaces+f] = eN; - EToF[e*Nfaces+f] = fN; - EToP[e*Nfaces+f] = rN; - } - } - - MPI_Barrier(comm); - MPI_Type_free(&MPI_PARALLELFACE_T); - free(sendFaces); - free(recvFaces); - - //record the number of elements in the whole mesh - hlong NelementsLocal = (hlong) Nelements; - NelementsGlobal = 0; - MPI_Allreduce(&NelementsLocal, &NelementsGlobal, 1, MPI_HLONG, MPI_SUM, comm); -} diff --git a/libs/mesh/meshParallelReaderQuad3D.cpp b/libs/mesh/meshParallelReaderQuad3D.cpp deleted file mode 100644 index 963f4efab..000000000 --- a/libs/mesh/meshParallelReaderQuad3D.cpp +++ /dev/null @@ -1,227 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 
-copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -#include "mesh.hpp" -#include "mesh/mesh3D.hpp" - -/* - purpose: read gmsh quadrilateral mesh -*/ -void meshQuad3D::ParallelReader(const char *fileName){ - - FILE *fp = fopen(fileName, "r"); - - dim = 3; - Nverts = 4; // number of vertices per element - Nfaces = 4; - NfaceVertices = 2; - - int faceVertices_[4][2] = {{0,1},{1,2},{2,3},{3,0}}; - - faceVertices = - (int*) calloc(NfaceVertices*Nfaces, sizeof(int)); - - memcpy(faceVertices, faceVertices_[0], NfaceVertices*Nfaces*sizeof(int)); - - if(fp==NULL){ - stringstream ss; - ss << "Cannot open file: " << fileName; - LIBP_ABORT(ss.str()) - } - - char buf[BUFSIZ]; - do{ - if (!fgets(buf, BUFSIZ, fp)) { //read to end of line - stringstream ss; - ss << "Error reading mesh file: " << fileName; - LIBP_ABORT(ss.str()) - } - }while(!strstr(buf, "$Nodes")); - - /* read number of nodes in mesh */ - if (!fgets(buf, BUFSIZ, fp)) { //read to end of line - stringstream ss; - ss << "Error reading mesh file: " << fileName; - LIBP_ABORT(ss.str()) - } - sscanf(buf, hlongFormat, &(Nnodes)); - - /* allocate space for node coordinates */ - dfloat *VX = (dfloat*) calloc(Nnodes, sizeof(dfloat)); - dfloat *VY = (dfloat*) calloc(Nnodes, sizeof(dfloat)); - dfloat *VZ = (dfloat*) calloc(Nnodes, sizeof(dfloat)); - - 
/* load nodes */ - for(int n=0;nExchange(x, Np, ogs_dfloat); - halo->Exchange(y, Np, ogs_dfloat); - halo->Exchange(z, Np, ogs_dfloat); + o_x = platform.malloc(x); + o_y = platform.malloc(y); + o_z = platform.malloc(z); } + +} //namespace libp diff --git a/libs/mesh/meshPhysicalNodesQuad2D.cpp b/libs/mesh/meshPhysicalNodesQuad2D.cpp index 82a05210c..af6fc3b25 100644 --- a/libs/mesh/meshPhysicalNodesQuad2D.cpp +++ b/libs/mesh/meshPhysicalNodesQuad2D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,15 +25,15 @@ SOFTWARE. */ #include "mesh.hpp" -#include "mesh/mesh2D.hpp" -void meshQuad2D::PhysicalNodes(){ +namespace libp { - x = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat)); - y = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat)); - z = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat)); +void mesh_t::PhysicalNodesQuad2D(){ - dlong cnt = 0; + x.malloc(Nelements*Np); + y.malloc(Nelements*Np); + + #pragma omp parallel for for(dlong e=0;eExchange(x, Np, ogs_dfloat); - halo->Exchange(y, Np, ogs_dfloat); + o_x = platform.malloc(x); + o_y = platform.malloc(y); } + +} //namespace libp diff --git a/libs/mesh/meshPhysicalNodesQuad3D.cpp b/libs/mesh/meshPhysicalNodesQuad3D.cpp index 295ce6308..c6785a8ea 100644 --- a/libs/mesh/meshPhysicalNodesQuad3D.cpp +++ b/libs/mesh/meshPhysicalNodesQuad3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to 
deal @@ -25,15 +25,16 @@ SOFTWARE. */ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" -void meshQuad3D::PhysicalNodes(){ +namespace libp { - x = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat)); - y = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat)); - z = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat)); +void mesh_t::PhysicalNodesQuad3D(){ - int cnt = 0; + x.malloc(Nelements*Np); + y.malloc(Nelements*Np); + z.malloc(Nelements*Np); + + #pragma omp parallel for for(int e=0;eExchange(x, Np, ogs_dfloat); - halo->Exchange(y, Np, ogs_dfloat); - halo->Exchange(z, Np, ogs_dfloat); + o_x = platform.malloc(x); + o_y = platform.malloc(y); + o_z = platform.malloc(z); } + +} //namespace libp diff --git a/libs/mesh/meshPhysicalNodesTet3D.cpp b/libs/mesh/meshPhysicalNodesTet3D.cpp index e9cc68443..5cc2d7d30 100644 --- a/libs/mesh/meshPhysicalNodesTet3D.cpp +++ b/libs/mesh/meshPhysicalNodesTet3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,15 +25,16 @@ SOFTWARE. 
*/ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" -void meshTet3D::PhysicalNodes(){ +namespace libp { - x = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat)); - y = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat)); - z = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat)); +void mesh_t::PhysicalNodesTet3D(){ - dlong cnt = 0; + x.malloc(Nelements*Np); + y.malloc(Nelements*Np); + z.malloc(Nelements*Np); + + #pragma omp parallel for for(dlong e=0;eExchange(x, Np, ogs_dfloat); - halo->Exchange(y, Np, ogs_dfloat); - halo->Exchange(z, Np, ogs_dfloat); + o_x = platform.malloc(x); + o_y = platform.malloc(y); + o_z = platform.malloc(z); } + +} //namespace libp diff --git a/libs/mesh/meshPhysicalNodesTri2D.cpp b/libs/mesh/meshPhysicalNodesTri2D.cpp index b8877b1d0..39a2a4587 100644 --- a/libs/mesh/meshPhysicalNodesTri2D.cpp +++ b/libs/mesh/meshPhysicalNodesTri2D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,15 +25,15 @@ SOFTWARE. 
*/ #include "mesh.hpp" -#include "mesh/mesh2D.hpp" -void meshTri2D::PhysicalNodes(){ +namespace libp { - x = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat)); - y = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat)); - z = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat)); // dummy +void mesh_t::PhysicalNodesTri2D(){ - dlong cnt = 0; + x.malloc(Nelements*Np); + y.malloc(Nelements*Np); + + #pragma omp parallel for for(dlong e=0;eExchange(x, Np, ogs_dfloat); - halo->Exchange(y, Np, ogs_dfloat); + o_x = platform.malloc(x); + o_y = platform.malloc(y); } + +} //namespace libp diff --git a/libs/mesh/meshPhysicalNodesTri3D.cpp b/libs/mesh/meshPhysicalNodesTri3D.cpp index 720a74258..700f07ba4 100644 --- a/libs/mesh/meshPhysicalNodesTri3D.cpp +++ b/libs/mesh/meshPhysicalNodesTri3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,15 +25,16 @@ SOFTWARE. 
*/ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" -void meshTri3D::PhysicalNodes(){ +namespace libp { - x = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat)); - y = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat)); - z = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat)); +void mesh_t::PhysicalNodesTri3D(){ - int cnt = 0; + x.malloc(Nelements*Np); + y.malloc(Nelements*Np); + z.malloc(Nelements*Np); + + #pragma omp parallel for for(int e=0;eExchange(x, Np, ogs_dfloat); - halo->Exchange(y, Np, ogs_dfloat); - halo->Exchange(z, Np, ogs_dfloat); + o_x = platform.malloc(x); + o_y = platform.malloc(y); + o_z = platform.malloc(z); } + +} //namespace libp diff --git a/libs/mesh/meshPlotInterpHex3D.cpp b/libs/mesh/meshPlotInterpHex3D.cpp index 6d064f002..a394accc9 100644 --- a/libs/mesh/meshPlotInterpHex3D.cpp +++ b/libs/mesh/meshPlotInterpHex3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,24 +25,20 @@ SOFTWARE. 
*/ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" + +namespace libp { //interpolate field to plotting nodes -void meshHex3D::PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch){ - - dfloat *IQ, *IIQ; - - bool alloc_scratch=false; - if (scratch==nullptr) { - //if not provided with a scratch space, alloc our own - alloc_scratch=true; - IQ = (dfloat *) malloc(plotNq*Nq*Nq*sizeof(dfloat)); - IIQ = (dfloat *) malloc(plotNq*plotNq*Nq*sizeof(dfloat)); - } else { - IQ = scratch; - IIQ = scratch + plotNq*Nq*Nq; +void mesh_t::PlotInterpHex3D(const memory q, memory Iq, memory scratch){ + + if (scratch.length()< static_cast(plotNq*Nq*Nq + plotNq*plotNq*Nq)) { + //if not provided with enough scratch space, alloc our own + scratch.malloc(plotNq*Nq*Nq + plotNq*plotNq*Nq); } + memory IQ = scratch; + memory IIQ = scratch + plotNq*Nq*Nq; + //interpolate in r for(int k=0;k q, memory Iq, memory scratch){ - bool alloc_scratch=false; - if (scratch==nullptr) { - //if not provided with a scratch space, alloc our own - alloc_scratch=true; - IQ = (dfloat *) malloc(plotNq*Nq*sizeof(dfloat)); - } else { - IQ = scratch; + if (scratch.length()< static_cast(plotNq*Nq)) { + //if not provided with enough scratch space, alloc our own + scratch.malloc(plotNq*Nq); } + memory IQ = scratch; //interpolate in r for(int j=0;j q, memory Iq, memory scratch){ //interpolate for(int n=0;n q, memory Iq, memory scratch){ //interpolate for(int n=0;n(pmlElements); + o_pmlIds = platform.malloc(pmlIds); + o_nonPmlElements = platform.malloc(nonPmlElements); } void mesh_t::MultiRatePmlSetup(){ - mrNnonPmlElements = (dlong *) calloc(mrNlevels,sizeof(dlong)); - mrNpmlElements = (dlong *) calloc(mrNlevels,sizeof(dlong)); + mrNnonPmlElements.malloc(mrNlevels, 0); + mrNpmlElements.malloc(mrNlevels, 0); //count PML elements for (dlong e=0;e(mrPmlElements[lev]); + o_mrPmlIds[lev] = platform.malloc(mrPmlIds[lev]); + o_mrNonPmlElements[lev] = platform.malloc(mrNonPmlElements[lev]); } -} \ No newline at end of file +} + 
+} //namespace libp diff --git a/libs/mesh/meshParallelReaderHex3D.cpp b/libs/mesh/meshReadGmshHex3D.cpp similarity index 60% rename from libs/mesh/meshParallelReaderHex3D.cpp rename to libs/mesh/meshReadGmshHex3D.cpp index 498f797f8..ad09785fa 100644 --- a/libs/mesh/meshParallelReaderHex3D.cpp +++ b/libs/mesh/meshReadGmshHex3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,84 +25,57 @@ SOFTWARE. */ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" + +namespace libp { /* purpose: read gmsh hexrahedra mesh */ -void meshHex3D::ParallelReader(const char *fileName){ - - FILE *fp = fopen(fileName, "r"); - - dim = 3; - Nverts = 8; // number of vertices per element - Nfaces = 6; - NfaceVertices = 4; - - // vertices on each face - int _faceVertices[6][4] = {{0,1,2,3},{0,1,5,4},{1,2,6,5},{2,3,7,6},{3,0,4,7},{4,5,6,7}}; - - faceVertices = - (int*) calloc(NfaceVertices*Nfaces, sizeof(int)); +void mesh_t::ReadGmshHex3D(const std::string fileName){ - memcpy(faceVertices, _faceVertices[0], NfaceVertices*Nfaces*sizeof(int)); - - if(fp==NULL){ - stringstream ss; - ss << "Cannot open file: " << fileName; - LIBP_ABORT(ss.str()) - } + FILE *fp = fopen(fileName.c_str(), "r"); + LIBP_ABORT("Cannot open file: " << fileName, + fp==NULL); char buf[BUFSIZ]; do{ - if (!fgets(buf, BUFSIZ, fp)) { //read to end of line - stringstream ss; - ss << "Error reading mesh file: " << fileName; - LIBP_ABORT(ss.str()) - } + //read to end of line + LIBP_ABORT("Error reading mesh file: " << fileName, + !fgets(buf, BUFSIZ, fp)); }while(!strstr(buf, "$Nodes")); /* read number of nodes in mesh */ - if (!fgets(buf, BUFSIZ, fp)) { //read to end of line - stringstream ss; - ss << "Error reading mesh file: " 
<< fileName; - LIBP_ABORT(ss.str()) - } + //read to end of line + LIBP_ABORT("Error reading mesh file: " << fileName, + !fgets(buf, BUFSIZ, fp)); sscanf(buf, hlongFormat, &(Nnodes)); /* allocate space for node coordinates */ - dfloat *VX = (dfloat*) calloc(Nnodes, sizeof(dfloat)); - dfloat *VY = (dfloat*) calloc(Nnodes, sizeof(dfloat)); - dfloat *VZ = (dfloat*) calloc(Nnodes, sizeof(dfloat)); + memory VX(Nnodes); + memory VY(Nnodes); + memory VZ(Nnodes); /* load nodes */ for(hlong n=0;n VX(Nnodes); + memory VY(Nnodes); /* load nodes */ for(hlong n=0;n VX(Nnodes); + memory VY(Nnodes); + memory VZ(Nnodes); + + /* load nodes */ + for(int n=0;n VX(Nnodes); + memory VY(Nnodes); + memory VZ(Nnodes); /* load nodes */ for(hlong n=0;n VX(Nnodes); + memory VY(Nnodes); /* load nodes */ for(hlong n=0;n VX(Nnodes); + memory VY(Nnodes); + memory VZ(Nnodes); + + /* load nodes */ + for(int n=0;n(D); /* Plotting data */ - plotN = N_ + 3; //enriched interpolation space for plotting + plotN = N + 3; //enriched interpolation space for plotting plotNq = plotN + 1; plotNp = plotNq*plotNq*plotNq; /* Plotting nodes */ - plotR = (dfloat *) malloc(plotNp*sizeof(dfloat)); - plotS = (dfloat *) malloc(plotNp*sizeof(dfloat)); - plotT = (dfloat *) malloc(plotNp*sizeof(dfloat)); EquispacedNodesHex3D(plotN, plotR, plotS, plotT); plotNelements = 6*plotN*plotN*plotN; plotNverts = 4; - plotEToV = (int*) malloc(plotNelements*plotNverts*sizeof(int)); EquispacedEToVHex3D(plotN, plotEToV); - dfloat *plot1D = (dfloat *) malloc(plotNq*sizeof(dfloat)); + memory plot1D; EquispacedNodes1D(plotN, plot1D); + InterpolationMatrix1D(N, gllz, plot1D, plotInterp); - plotInterp = (dfloat *) malloc(Nq*plotNq*sizeof(dfloat)); - InterpolationMatrix1D(N, Nq, gllz, plotNq, plot1D, plotInterp); - - free(gllz); - free(plot1D); + props["defines/" "p_N"]= N; + props["defines/" "p_Nq"]= Nq; + props["defines/" "p_Np"]= Np; + props["defines/" "p_Nfp"]= Nfp; + props["defines/" "p_NfacesNfp"]= Nfp*Nfaces; } +} //namespace libp 
diff --git a/libs/mesh/meshReferenceNodesQuad2D.cpp b/libs/mesh/meshReferenceNodesQuad2D.cpp index cb56703b9..46a0ae1db 100644 --- a/libs/mesh/meshReferenceNodesQuad2D.cpp +++ b/libs/mesh/meshReferenceNodesQuad2D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,70 +25,52 @@ SOFTWARE. */ #include "mesh.hpp" -#include "mesh/mesh2D.hpp" -#include "mesh/mesh3D.hpp" -void meshQuad3D::ReferenceNodes(int N_){ - mesh_t *mesh_p = (mesh_t*) this; - meshQuad2D* quadmesh = (meshQuad2D*) mesh_p; - quadmesh->meshQuad2D::ReferenceNodes(N_); -} +namespace libp { -void meshQuad2D::ReferenceNodes(int N_){ +void mesh_t::ReferenceNodesQuad2D(){ - N = N_; - Nfp = N+1; Nq = (N+1); + Nfp = N+1; Np = (N+1)*(N+1); /* Nodal Data */ - r = (dfloat *) malloc(Np*sizeof(dfloat)); - s = (dfloat *) malloc(Np*sizeof(dfloat)); NodesQuad2D(N, r, s); - - faceNodes = (int *) malloc(Nfaces*Nfp*sizeof(int)); FaceNodesQuad2D(N, r, s, faceNodes); - - vertexNodes = (int*) calloc(Nverts, sizeof(int)); VertexNodesQuad2D(N, r, s, vertexNodes); //GLL quadrature - dfloat *gllz = (dfloat *) malloc((N+1)*sizeof(dfloat)); - w = (dfloat *) malloc((N+1)*sizeof(dfloat)); - JacobiGLL(N, gllz, w); + JacobiGLL(N, gllz, gllw); //Lumped Mass matrix - MM = (dfloat *) malloc(Np*Np*sizeof(dfloat)); - invMM = (dfloat *) malloc(Np*Np*sizeof(dfloat)); - LumpedMassMatrixQuad2D(N, w, MM); - invLumpedMassMatrixQuad2D(N, w, invMM); + LumpedMassMatrixQuad2D(N, gllw, MM); + invLumpedMassMatrixQuad2D(N, gllw, invMM); // D matrix - D = (dfloat *) malloc(Nq*Nq*sizeof(dfloat)); - Dmatrix1D(N, Nq, gllz, Nq, gllz, D); + Dmatrix1D(N, gllz, gllz, D); + o_D = platform.malloc(D); /* Plotting data */ - plotN = N_ + 3; //enriched interpolation 
space for plotting + plotN = N + 3; //enriched interpolation space for plotting plotNq = plotN + 1; plotNp = plotNq*plotNq; /* Plotting nodes */ - plotR = (dfloat *) malloc(plotNp*sizeof(dfloat)); - plotS = (dfloat *) malloc(plotNp*sizeof(dfloat)); EquispacedNodesQuad2D(plotN, plotR, plotS); plotNelements = 2*plotN*plotN; plotNverts = 3; - plotEToV = (int*) malloc(plotNelements*plotNverts*sizeof(int)); EquispacedEToVQuad2D(plotN, plotEToV); - dfloat *plot1D = (dfloat *) malloc(plotNq*sizeof(dfloat)); + memory plot1D; EquispacedNodes1D(plotN, plot1D); + InterpolationMatrix1D(N, gllz, plot1D, plotInterp); - plotInterp = (dfloat *) malloc(Nq*plotNq*sizeof(dfloat)); - InterpolationMatrix1D(N, Nq, gllz, plotNq, plot1D, plotInterp); - - free(gllz); - free(plot1D); + props["defines/" "p_N"]= N; + props["defines/" "p_Nq"]= Nq; + props["defines/" "p_Np"]= Np; + props["defines/" "p_Nfp"]= Nfp; + props["defines/" "p_NfacesNfp"]= Nfp*Nfaces; } +}//namespace libp diff --git a/libs/mesh/meshReferenceNodesTet3D.cpp b/libs/mesh/meshReferenceNodesTet3D.cpp index e85321969..8c8372c9c 100644 --- a/libs/mesh/meshReferenceNodesTet3D.cpp +++ b/libs/mesh/meshReferenceNodesTet3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,76 +25,95 @@ SOFTWARE. 
*/ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" -void meshTet3D::ReferenceNodes(int N_){ +namespace libp { + +void mesh_t::ReferenceNodesTet3D(){ - N = N_; Nfp = ((N+1)*(N+2))/2; Np = ((N+1)*(N+2)*(N+3))/6; /* Nodal Data */ - r = (dfloat *) malloc(Np*sizeof(dfloat)); - s = (dfloat *) malloc(Np*sizeof(dfloat)); - t = (dfloat *) malloc(Np*sizeof(dfloat)); NodesTet3D(N, r, s, t); - - faceNodes = (int *) malloc(Nfaces*Nfp*sizeof(int)); FaceNodesTet3D(N, r, s, t, faceNodes); - - vertexNodes = (int*) calloc(Nverts, sizeof(int)); VertexNodesTet3D(N, r, s, t, vertexNodes); - dfloat *V = (dfloat *) malloc(Np*Np*sizeof(dfloat)); - VandermondeTet3D(N, Np, r, s, t, V); + memory V; + VandermondeTet3D(N, r, s, t, V); //Mass matrix - MM = (dfloat *) malloc(Np*Np*sizeof(dfloat)); - invMM = (dfloat *) malloc(Np*Np*sizeof(dfloat)); MassMatrixTet3D(Np, V, MM); invMassMatrixTet3D(Np, V, invMM); - free(V); + o_MM = platform.malloc(MM); //MM is symmetric //packed D matrices - D = (dfloat *) malloc(3*Np*Np*sizeof(dfloat)); + DmatrixTet3D(N, r, s, t, D); Dr = D + 0*Np*Np; Ds = D + 1*Np*Np; Dt = D + 2*Np*Np; - DmatrixTet3D(N, Np, r, s, t, Dr, Ds, Dt); - LIFT = (dfloat *) malloc(Np*Nfaces*Nfp*sizeof(dfloat)); - LIFTmatrixTet3D(N, faceNodes, r, s, t, LIFT); + memory DT(3*Np*Np); + memory DrT = DT + 0*Np*Np; + memory DsT = DT + 1*Np*Np; + memory DtT = DT + 2*Np*Np; + linAlg_t::matrixTranspose(Np, Np, Dr, Np, DrT, Np); + linAlg_t::matrixTranspose(Np, Np, Ds, Np, DsT, Np); + linAlg_t::matrixTranspose(Np, Np, Dt, Np, DtT, Np); + o_D = platform.malloc(DT); - sM = (dfloat *) calloc(Np*Nfaces*Nfp,sizeof(dfloat)); + LIFTmatrixTet3D(N, faceNodes, r, s, t, LIFT); SurfaceMassMatrixTet3D(N, MM, LIFT, sM); + memory LIFTT(Np*Nfaces*Nfp); + linAlg_t::matrixTranspose(Np, Nfp*Nfaces, LIFT, Nfp*Nfaces, LIFTT, Np); + + memory sMT(Np*Nfaces*Nfp); + linAlg_t::matrixTranspose(Np, Nfp*Nfaces, sM, Nfp*Nfaces, sMT, Np); + + o_sM = platform.malloc(sMT); + o_LIFT = platform.malloc(LIFTT); + //packed stiffness 
matrices - S = (dfloat*) calloc(6*Np*Np, sizeof(dfloat)); + SmatrixTet3D(N, Dr, Ds, Dt, MM, S); Srr = S + 0*Np*Np; Srs = S + 1*Np*Np; Srt = S + 2*Np*Np; Sss = S + 3*Np*Np; Sst = S + 4*Np*Np; Stt = S + 5*Np*Np; - SmatrixTet3D(N, Dr, Ds, Dt, MM, Srr, Srs, Srt, Sss, Sst, Stt); + + memory ST(6*Np*Np); + memory SrrT = ST + 0*Np*Np; + memory SrsT = ST + 1*Np*Np; + memory SrtT = ST + 2*Np*Np; + memory SssT = ST + 3*Np*Np; + memory SstT = ST + 4*Np*Np; + memory SttT = ST + 5*Np*Np; + linAlg_t::matrixTranspose(Np, Np, Srr, Np, SrrT, Np); + linAlg_t::matrixTranspose(Np, Np, Srs, Np, SrsT, Np); + linAlg_t::matrixTranspose(Np, Np, Srt, Np, SrtT, Np); + linAlg_t::matrixTranspose(Np, Np, Sss, Np, SssT, Np); + linAlg_t::matrixTranspose(Np, Np, Sst, Np, SstT, Np); + linAlg_t::matrixTranspose(Np, Np, Stt, Np, SttT, Np); + + o_S = platform.malloc(ST); /* Plotting data */ plotN = N + 3; //enriched interpolation space for plotting plotNp = (plotN+1)*(plotN+2)*(plotN+3)/6; /* Plotting nodes */ - plotR = (dfloat *) malloc(plotNp*sizeof(dfloat)); - plotS = (dfloat *) malloc(plotNp*sizeof(dfloat)); - plotT = (dfloat *) malloc(plotNp*sizeof(dfloat)); EquispacedNodesTet3D(plotN, plotR, plotS, plotT); plotNelements = plotN*plotN*plotN; plotNverts = 4; - plotEToV = (int*) malloc(plotNelements*plotNverts*sizeof(int)); EquispacedEToVTet3D(plotN, plotEToV); + InterpolationMatrixTet3D(N, r, s, t, plotR, plotS, plotT, plotInterp); - plotInterp = (dfloat *) malloc(Np*plotNp*sizeof(dfloat)); - InterpolationMatrixTet3D(N, Np, r, s, t, plotNp, plotR, plotS, plotT, plotInterp); + props["defines/" "p_N"]= N; + props["defines/" "p_Np"]= Np; + props["defines/" "p_Nfp"]= Nfp; + props["defines/" "p_NfacesNfp"]= Nfp*Nfaces; } - +} //namespace libp diff --git a/libs/mesh/meshReferenceNodesTri2D.cpp b/libs/mesh/meshReferenceNodesTri2D.cpp index b8e6d5770..26b673137 100644 --- a/libs/mesh/meshReferenceNodesTri2D.cpp +++ b/libs/mesh/meshReferenceNodesTri2D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 
2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,75 +25,83 @@ SOFTWARE. */ #include "mesh.hpp" -#include "mesh/mesh2D.hpp" -#include "mesh/mesh3D.hpp" -void meshTri3D::ReferenceNodes(int N_){ - mesh_t *mesh_p = (mesh_t*) this; - meshTri2D* trimesh = (meshTri2D*) mesh_p; - trimesh->meshTri2D::ReferenceNodes(N); -} +namespace libp { -void meshTri2D::ReferenceNodes(int N_){ +void mesh_t::ReferenceNodesTri2D(){ - N = N_; Nfp = N+1; Np = (N+1)*(N+2)/2; /* Nodal Data */ - r = (dfloat *) malloc(Np*sizeof(dfloat)); - s = (dfloat *) malloc(Np*sizeof(dfloat)); NodesTri2D(N, r, s); - - faceNodes = (int *) malloc(Nfaces*Nfp*sizeof(int)); FaceNodesTri2D(N, r, s, faceNodes); - - vertexNodes = (int*) calloc(Nverts, sizeof(int)); VertexNodesTri2D(N, r, s, vertexNodes); - dfloat *V = (dfloat *) malloc(Np*Np*sizeof(dfloat)); - VandermondeTri2D(N, Np, r, s, V); + memory V; + VandermondeTri2D(N, r, s, V); //Mass matrix - MM = (dfloat *) malloc(Np*Np*sizeof(dfloat)); - invMM = (dfloat *) malloc(Np*Np*sizeof(dfloat)); MassMatrixTri2D(Np, V, MM); invMassMatrixTri2D(Np, V, invMM); - free(V); + o_MM = platform.malloc(MM); //MM is symmetric //packed D matrices - D = (dfloat *) malloc(2*Np*Np*sizeof(dfloat)); + DmatrixTri2D(N, r, s, D); Dr = D + 0*Np*Np; Ds = D + 1*Np*Np; - DmatrixTri2D(N, Np, r, s, Dr, Ds); - LIFT = (dfloat *) malloc(Np*Nfaces*Nfp*sizeof(dfloat)); - LIFTmatrixTri2D(N, faceNodes, r, s, LIFT); + memory DT(2*Np*Np); + memory DrT = DT + 0*Np*Np; + memory DsT = DT + 1*Np*Np; + linAlg_t::matrixTranspose(Np, Np, Dr, Np, DrT, Np); + linAlg_t::matrixTranspose(Np, Np, Ds, Np, DsT, Np); + o_D = platform.malloc(DT); - sM = (dfloat *) calloc(Np*Nfaces*Nfp,sizeof(dfloat)); + LIFTmatrixTri2D(N, faceNodes, r, s, LIFT); 
SurfaceMassMatrixTri2D(N, MM, LIFT, sM); + memory LIFTT(Np*Nfaces*Nfp); + linAlg_t::matrixTranspose(Np, Nfp*Nfaces, LIFT, Nfp*Nfaces, LIFTT, Np); + + memory sMT(Np*Nfaces*Nfp); + linAlg_t::matrixTranspose(Np, Nfp*Nfaces, sM, Nfp*Nfaces, sMT, Np); + + o_sM = platform.malloc(sMT); + o_LIFT = platform.malloc(LIFTT); + //packed stiffness matrices - S = (dfloat*) calloc(3*Np*Np, sizeof(dfloat)); + SmatrixTri2D(N, Dr, Ds, MM, S); Srr = S + 0*Np*Np; Srs = S + 1*Np*Np; Sss = S + 2*Np*Np; - SmatrixTri2D(N, Dr, Ds, MM, Srr, Srs, Sss); + + memory ST(3*Np*Np); + memory SrrT = ST + 0*Np*Np; + memory SrsT = ST + 1*Np*Np; + memory SssT = ST + 2*Np*Np; + linAlg_t::matrixTranspose(Np, Np, Srr, Np, SrrT, Np); + linAlg_t::matrixTranspose(Np, Np, Srs, Np, SrsT, Np); + linAlg_t::matrixTranspose(Np, Np, Sss, Np, SssT, Np); + + o_S = platform.malloc(ST); /* Plotting data */ plotN = N + 3; //enriched interpolation space for plotting plotNp = (plotN+1)*(plotN+2)/2; /* Plotting nodes */ - plotR = (dfloat *) malloc(plotNp*sizeof(dfloat)); - plotS = (dfloat *) malloc(plotNp*sizeof(dfloat)); EquispacedNodesTri2D(plotN, plotR, plotS); plotNelements = plotN*plotN; plotNverts = 3; - plotEToV = (int*) malloc(plotNelements*plotNverts*sizeof(int)); EquispacedEToVTri2D(plotN, plotEToV); + InterpolationMatrixTri2D(N, r, s, plotR, plotS, plotInterp); - plotInterp = (dfloat *) malloc(Np*plotNp*sizeof(dfloat)); - InterpolationMatrixTri2D(N, Np, r, s, plotNp, plotR, plotS, plotInterp); + props["defines/" "p_N"]= N; + props["defines/" "p_Np"]= Np; + props["defines/" "p_Nfp"]= Nfp; + props["defines/" "p_NfacesNfp"]= Nfp*Nfaces; } + +} //namespace libp diff --git a/libs/mesh/meshSetElementType.cpp b/libs/mesh/meshSetElementType.cpp new file mode 100644 index 000000000..861f7130f --- /dev/null +++ b/libs/mesh/meshSetElementType.cpp @@ -0,0 +1,90 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to 
any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "mesh.hpp" + +namespace libp { + +void mesh_t::SetElementType(const Mesh::ElementType eType) { + + if (eType==Mesh::TRIANGLES) { + elementType = Mesh::TRIANGLES; + + Nverts = 3; // number of vertices per element + Nfaces = 3; // number of faces per element + NfaceVertices = 2; // number of vertices per face + + // vertices on each face + int _faceVertices[4][2] = {{0,1},{1,2},{2,0}}; + + faceVertices.malloc(NfaceVertices*Nfaces); + faceVertices.copyFrom(_faceVertices[0]); + + } else if (eType==Mesh::QUADRILATERALS) { + elementType = Mesh::QUADRILATERALS; + + Nverts = 4; // number of vertices per element + Nfaces = 4; // number of faces per element + NfaceVertices = 2; // number of vertices per face + + // vertices on each face + int _faceVertices[4][2] = {{0,1},{1,2},{2,3},{3,0}}; + + faceVertices.malloc(NfaceVertices*Nfaces); + faceVertices.copyFrom(_faceVertices[0]); + + } else if (eType==Mesh::TETRAHEDRA) { + elementType = Mesh::TETRAHEDRA; + + Nverts = 4; // number of 
vertices per element + Nfaces = 4; // number of faces per element + NfaceVertices = 3; // number of vertices per face + + // vertices on each face + int _faceVertices[4][3] = {{0,1,2},{0,3,1},{1,3,2},{0,2,3}}; + + faceVertices.malloc(NfaceVertices*Nfaces); + faceVertices.copyFrom(_faceVertices[0]); + + } else if (eType==Mesh::HEXAHEDRA) { + elementType = Mesh::HEXAHEDRA; + + Nverts = 8; // number of vertices per element + Nfaces = 6; // number of faces per element + NfaceVertices = 4; // number of vertices per face + + // vertices on each face + int _faceVertices[6][4] = + {{0,1,2,3},{0,4,5,1},{1,5,6,2},{2,6,7,3},{0,3,7,4},{4,7,6,5}}; + + faceVertices.malloc(NfaceVertices*Nfaces); + faceVertices.copyFrom(_faceVertices[0]); + } else { + LIBP_FORCE_ABORT("Unknown element type: " << eType); + } +} + +} //namespace libp diff --git a/libs/mesh/meshSettings.cpp b/libs/mesh/meshSettings.cpp index 70f4a61d9..c8e5fa396 100644 --- a/libs/mesh/meshSettings.cpp +++ b/libs/mesh/meshSettings.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,8 +25,11 @@ SOFTWARE. 
*/ #include "mesh.hpp" +#include "parAdogs.hpp" -meshSettings_t::meshSettings_t(MPI_Comm& _comm): +namespace libp { + +meshSettings_t::meshSettings_t(comm_t _comm): settings_t(_comm) { newSetting("MESH FILE", @@ -83,14 +86,13 @@ meshSettings_t::meshSettings_t(MPI_Comm& _comm): "4", "Degree of polynomial finite element space", {"1","2","3","4","5","6","7","8","9","10","11","12","13","14","15"}); + + paradogs::AddSettings(*this); } void meshSettings_t::report() { - int rank; - MPI_Comm_rank(comm, &rank); - - if (rank==0) { + if (comm.rank()==0) { std::cout << "Mesh Settings:\n\n"; if (!compareSetting("MESH FILE","BOX")) reportSetting("MESH FILE"); @@ -127,5 +129,11 @@ void meshSettings_t::report() { } reportSetting("POLYNOMIAL DEGREE"); + + if (!compareSetting("MESH FILE","BOX")) { + paradogs::ReportSettings(*this); + } } } + +} //namespace libp diff --git a/libs/mesh/meshSetup.cpp b/libs/mesh/meshSetup.cpp index bb01929c1..479c75b47 100644 --- a/libs/mesh/meshSetup.cpp +++ b/libs/mesh/meshSetup.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,94 +25,80 @@ SOFTWARE. 
*/ #include "mesh.hpp" -#include "mesh/mesh2D.hpp" -#include "mesh/mesh3D.hpp" -mesh_t& mesh_t::Setup(platform_t& platform, meshSettings_t& settings, - MPI_Comm comm){ +namespace libp { - string fileName; - int N, dim, elementType; +void mesh_t::Setup(platform_t& _platform, meshSettings_t& _settings, + comm_t _comm){ - settings.getSetting("MESH FILE", fileName); - settings.getSetting("POLYNOMIAL DEGREE", N); - settings.getSetting("ELEMENT TYPE", elementType); + platform = _platform; + settings = _settings; + props = platform.props(); + + comm = _comm.Dup(); + rank = comm.rank(); + size = comm.size(); + + int eType=0; + settings.getSetting("ELEMENT TYPE", eType); settings.getSetting("MESH DIMENSION", dim); - mesh_t *mesh=NULL; - switch(elementType){ - case TRIANGLES: - if(dim==2) - mesh = new meshTri2D(platform, settings, comm); - else - mesh = new meshTri3D(platform, settings, comm); - break; - case QUADRILATERALS: - if(dim==2) - mesh = new meshQuad2D(platform, settings, comm); - else - mesh = new meshQuad3D(platform, settings, comm); - break; - case TETRAHEDRA: - mesh = new meshTet3D(platform, settings, comm); - break; - case HEXAHEDRA: - mesh = new meshHex3D(platform, settings, comm); - break; - } - mesh->elementType = elementType; + SetElementType(Mesh::ElementType(eType)); - mesh->ringHalo = NULL; + props["defines/" "p_dim"]= dim; + props["defines/" "p_Nfaces"]= Nfaces; + + std::string fileName; + settings.getSetting("MESH FILE", fileName); if (settings.compareSetting("MESH FILE","PMLBOX")) { //build a box mesh with a pml layer - mesh->SetupPmlBox(); + SetupPmlBox(); } else if (settings.compareSetting("MESH FILE","BOX")) { //build a box mesh - mesh->SetupBox(); + SetupBox(); } else { // read chunk of elements from file - mesh->ParallelReader(fileName.c_str()); + ReadGmsh(fileName); - // partition elements using Morton ordering & parallel sort - mesh->GeometricPartition(); + // partition elements using parAdogs + Partition(); } - // connect elements using 
parallel sort - mesh->ParallelConnect(); + // load reference (r,s) element nodes + settings.getSetting("POLYNOMIAL DEGREE", N); + ReferenceNodes(); - // print out connectivity statistics - mesh->PrintPartitionStatistics(); + // connect elements + Connect(); // connect elements to boundary faces - mesh->ConnectBoundary(); - - // load reference (r,s) element nodes - mesh->ReferenceNodes(N); + ConnectBoundary(); // set up halo exchange info for MPI (do before connect face nodes) - mesh->HaloSetup(); + HaloSetup(); - // compute physical (x,y) locations of the element nodes - mesh->PhysicalNodes(); + // connect face vertices + ConnectFaceVertices(); - // compute geometric factors - mesh->GeometricFactors(); + // connect face nodes + ConnectFaceNodes(); - // connect face nodes (find trace indices) - mesh->ConnectFaceNodes(); - - // compute surface geofacs - mesh->SurfaceGeometricFactors(); + // make global indexing + ConnectNodes(); - // make a global indexing - mesh->ParallelConnectNodes(); + // compute physical (x,y) locations of the element nodes + PhysicalNodes(); - // make an ogs operator and label local/global gather elements - mesh->ParallelGatherScatterSetup(); + // compute geometric factors + GeometricFactors(); - mesh->OccaSetup(); + // compute surface geofacs + SurfaceGeometricFactors(); - return *mesh; + // label local/global gather elements + GatherScatterSetup(); } + +} //namespace libp diff --git a/libs/mesh/meshSetupBoxHex3D.cpp b/libs/mesh/meshSetupBoxHex3D.cpp index 946c4c90b..eefe551c1 100644 --- a/libs/mesh/meshSetupBoxHex3D.cpp +++ b/libs/mesh/meshSetupBoxHex3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,34 +25,21 @@ SOFTWARE. 
*/ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" -void meshHex3D::SetupBox(){ +namespace libp { - dim = 3; - Nverts = 8; // number of vertices per element - Nfaces = 6; - NfaceVertices = 4; - - // vertices on each face - int _faceVertices[6][4] = - {{0,1,2,3},{0,1,5,4},{1,2,6,5},{2,3,7,6},{3,0,4,7},{4,5,6,7}}; - - faceVertices = - (int*) calloc(NfaceVertices*Nfaces, sizeof(int)); - - memcpy(faceVertices, _faceVertices[0], NfaceVertices*Nfaces*sizeof(int)); +void mesh_t::SetupBoxHex3D(){ // find a factorization size = size_x*size_y*size_z such that // size_x>=size_y>=size_z are all 'close' to one another int size_x, size_y, size_z; - factor3(size, size_x, size_y, size_z); + Factor3(size, size_x, size_y, size_z); - //find our coordinates in the MPI grid such that - // rank = rank_x + rank_y*size_x + rank_z*size_x*size_y - int rank_z = rank/(size_x*size_y); - int rank_y = (rank-rank_z*size_x*size_y)/size_x; - int rank_x = rank % size_x; + //determine (x,y,z) rank coordinates for this processes + int rank_x=-1, rank_y=-1, rank_z=-1; + RankDecomp3(size_x, size_y, size_z, + rank_x, rank_y, rank_z, + rank); //get global size from settings dlong NX, NY, NZ; @@ -97,9 +84,9 @@ void meshHex3D::SetupBox(){ dfloat dy = DIMY/NY; dfloat dz = DIMZ/NZ; - dlong offset_x = rank_x*(NX/size_x) + mymin(rank_x, (NX % size_x)); - dlong offset_y = rank_y*(NY/size_y) + mymin(rank_y, (NY % size_y)); - dlong offset_z = rank_z*(NZ/size_z) + mymin(rank_z, (NZ % size_z)); + dlong offset_x = rank_x*(NX/size_x) + std::min(rank_x, (NX % size_x)); + dlong offset_y = rank_y*(NY/size_y) + std::min(rank_y, (NY % size_y)); + dlong offset_z = rank_z*(NZ/size_z) + std::min(rank_z, (NZ % size_z)); //bottom corner of physical domain dfloat X0 = -DIMX/2.0 + offset_x*dx; @@ -115,18 +102,20 @@ void meshHex3D::SetupBox(){ Nnodes = NnX*NnY*NnZ; //global node count Nelements = nx*ny*nz; //local - EToV = (hlong*) calloc(Nelements*Nverts, sizeof(hlong)); - EX = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat)); 
- EY = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat)); - EZ = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat)); + EToV.malloc(Nelements*Nverts); + EX.malloc(Nelements*Nverts); + EY.malloc(Nelements*Nverts); + EZ.malloc(Nelements*Nverts); - elementInfo = (hlong*) calloc(Nelements, sizeof(hlong)); + elementInfo.malloc(Nelements); - dlong e = 0; + #pragma omp parallel for collapse(3) for(int k=0;k=size_y and are 'close' to one another int size_x, size_y; - factor2(size, size_x, size_y); + Factor2(size, size_x, size_y); - //find our coordinates in the MPI grid such that - // rank = rank_x + rank_y*size_x - int rank_y = rank / size_x; - int rank_x = rank % size_x; + //determine (x,y) rank coordinates for this processes + int rank_x=-1, rank_y=-1; + RankDecomp2(size_x, size_y, + rank_x, rank_y, + rank); //get global size from settings dlong NX, NY; @@ -91,8 +77,8 @@ void meshQuad2D::SetupBox(){ dfloat dx = DIMX/NX; dfloat dy = DIMY/NY; - dlong offset_x = rank_x*(NX/size_x) + mymin(rank_x, (NX % size_x)); - dlong offset_y = rank_y*(NY/size_y) + mymin(rank_y, (NY % size_y)); + dlong offset_x = rank_x*(NX/size_x) + std::min(rank_x, (NX % size_x)); + dlong offset_y = rank_y*(NY/size_y) + std::min(rank_y, (NY % size_y)); //bottom corner of physical domain dfloat X0 = -DIMX/2.0 + offset_x*dx; @@ -106,16 +92,18 @@ void meshQuad2D::SetupBox(){ Nnodes = NnX*NnY; //global node count Nelements = nx*ny; //local - EToV = (hlong*) calloc(Nelements*Nverts, sizeof(hlong)); - EX = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat)); - EY = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat)); + EToV.malloc(Nelements*Nverts); + EX.malloc(Nelements*Nverts); + EY.malloc(Nelements*Nverts); - elementInfo = (hlong*) calloc(Nelements, sizeof(hlong)); + elementInfo.malloc(Nelements); - dlong e = 0; + #pragma omp parallel for collapse(2) for(int j=0;j=size_y>=size_z are all 'close' to one another int size_x, size_y, size_z; - factor3(size, size_x, size_y, size_z); + Factor3(size, size_x, 
size_y, size_z); - //find our coordinates in the MPI grid such that - // rank = rank_x + rank_y*size_x + rank_z*size_x*size_y - int rank_z = rank/(size_x*size_y); - int rank_y = (rank-rank_z*size_x*size_y)/size_x; - int rank_x = rank % size_x; + //determine (x,y,z) rank coordinates for this processes + int rank_x=-1, rank_y=-1, rank_z=-1; + RankDecomp3(size_x, size_y, size_z, + rank_x, rank_y, rank_z, + rank); //get global size from settings dlong NX, NY, NZ; @@ -94,9 +84,9 @@ void meshTet3D::SetupBox(){ dfloat dy = DIMY/NY; dfloat dz = DIMZ/NZ; - dlong offset_x = rank_x*(NX/size_x) + mymin(rank_x, (NX % size_x)); - dlong offset_y = rank_y*(NY/size_y) + mymin(rank_y, (NY % size_y)); - dlong offset_z = rank_z*(NZ/size_z) + mymin(rank_z, (NZ % size_z)); + dlong offset_x = rank_x*(NX/size_x) + std::min(rank_x, (NX % size_x)); + dlong offset_y = rank_y*(NY/size_y) + std::min(rank_y, (NY % size_y)); + dlong offset_z = rank_z*(NZ/size_z) + std::min(rank_z, (NZ % size_z)); //bottom corner of physical domain dfloat X0 = -DIMX/2.0 + offset_x*dx; @@ -112,18 +102,20 @@ void meshTet3D::SetupBox(){ Nnodes = NnX*NnY*NnZ; //global node count Nelements = 6*nx*ny*nz; //local element count (each cube divided into 6 tets) - EToV = (hlong*) calloc(Nelements*Nverts, sizeof(hlong)); - EX = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat)); - EY = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat)); - EZ = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat)); + EToV.malloc(Nelements*Nverts); + EX.malloc(Nelements*Nverts); + EY.malloc(Nelements*Nverts); + EZ.malloc(Nelements*Nverts); - elementInfo = (hlong*) calloc(Nelements, sizeof(hlong)); + elementInfo.malloc(Nelements); - dlong e = 0; + #pragma omp parallel for collapse(3) for(int k=0;k=size_y and are 'close' to one another int size_x, size_y; - factor2(size, size_x, size_y); + Factor2(size, size_x, size_y); - //find our coordinates in the MPI grid such that - // rank = rank_x + rank_y*size_x - int rank_y = rank / size_x; - int rank_x = 
rank % size_x; + //determine (x,y) rank coordinates for this processes + int rank_x=-1, rank_y=-1; + RankDecomp2(size_x, size_y, + rank_x, rank_y, + rank); //get global size from settings dlong NX, NY; @@ -91,8 +77,8 @@ void meshTri2D::SetupBox(){ dfloat dx = DIMX/NX; dfloat dy = DIMY/NY; - dlong offset_x = rank_x*(NX/size_x) + mymin(rank_x, (NX % size_x)); - dlong offset_y = rank_y*(NY/size_y) + mymin(rank_y, (NY % size_y)); + dlong offset_x = rank_x*(NX/size_x) + std::min(rank_x, (NX % size_x)); + dlong offset_y = rank_y*(NY/size_y) + std::min(rank_y, (NY % size_y)); //bottom corner of physical domain dfloat X0 = -DIMX/2.0 + offset_x*dx; @@ -106,16 +92,18 @@ void meshTri2D::SetupBox(){ Nnodes = NnX*NnY; //global node count Nelements = 2*nx*ny; //local - EToV = (hlong*) calloc(Nelements*Nverts, sizeof(hlong)); - EX = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat)); - EY = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat)); + EToV.malloc(Nelements*Nverts); + EX.malloc(Nelements*Nverts); + EY.malloc(Nelements*Nverts); - elementInfo = (hlong*) calloc(Nelements, sizeof(hlong)); + elementInfo.malloc(Nelements); - dlong e = 0; + #pragma omp parallel for collapse(2) for(int j=0;jdim = dim; - mesh->Nverts = Nverts; - mesh->Nfaces = Nfaces; - mesh->NfaceVertices = NfaceVertices; - mesh->faceVertices = faceVertices; - - mesh->elementType = elementType; - - mesh->Nnodes = Nnodes; - mesh->EX = EX; // coordinates of vertices for each element - mesh->EY = EY; - mesh->EZ = EZ; - - mesh->Nelements = Nelements; - mesh->NelementsGlobal = NelementsGlobal; - mesh->EToV = EToV; // element-to-vertex connectivity - mesh->EToE = EToE; // element-to-element connectivity - mesh->EToF = EToF; // element-to-(local)face connectivity - mesh->EToP = EToP; // element-to-partition/process connectivity - mesh->EToB = EToB; // element-to-boundary condition type - - mesh->elementInfo = elementInfo; - - mesh->NboundaryFaces = NboundaryFaces; - mesh->boundaryInfo = boundaryInfo; - - mesh->halo = 
halo; - mesh->NinternalElements = NinternalElements; - mesh->NhaloElements = NhaloElements; - mesh->totalHaloPairs = totalHaloPairs; - mesh->internalElementIds = internalElementIds; - mesh->haloElementIds = haloElementIds; - mesh->o_internalElementIds = o_internalElementIds; - mesh->o_haloElementIds = o_haloElementIds; - - mesh->ogs = ogs; - mesh->globalIds = globalIds; - - mesh->NglobalGatherElements = NglobalGatherElements; - mesh->globalGatherElementList = globalGatherElementList; - mesh->o_globalGatherElementList = o_globalGatherElementList; - - mesh->NlocalGatherElements = NlocalGatherElements; - mesh->localGatherElementList = localGatherElementList; - mesh->o_localGatherElementList = o_localGatherElementList; + if (Nf==N) return mesh; + + mesh.N = Nf; // load reference (r,s) element nodes - mesh->ReferenceNodes(Nf); + mesh.ReferenceNodes(); + + // connect face nodes (find trace indices) + mesh.ConnectFaceNodes(); + + // make a global indexing + mesh.ConnectNodes(); // compute physical (x,y) locations of the element nodes - mesh->PhysicalNodes(); + mesh.PhysicalNodes(); // compute geometric factors - mesh->GeometricFactors(); - - // connect face nodes (find trace indices) - mesh->ConnectFaceNodes(); + mesh.GeometricFactors(); // compute surface geofacs - mesh->SurfaceGeometricFactors(); - - // make a global indexing - mesh->ParallelConnectNodes(); - - // make an ogs operator and label local/global gather elements - mesh->ParallelGatherScatterSetup(); + mesh.SurfaceGeometricFactors(); - mesh->OccaSetup(); + // label local/global gather elements + mesh.GatherScatterSetup(); - return *mesh; + return mesh; } + +} //namespace libp diff --git a/libs/mesh/meshSetupPmlBoxHex3D.cpp b/libs/mesh/meshSetupPmlBoxHex3D.cpp index 1b9149b02..348f5e580 100644 --- a/libs/mesh/meshSetupPmlBoxHex3D.cpp +++ b/libs/mesh/meshSetupPmlBoxHex3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 
Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,39 +25,26 @@ SOFTWARE. */ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" + +namespace libp { static void addHex(hlong i0, hlong j0, hlong k0, hlong NnX, hlong NnY, hlong NnZ, dfloat x0, dfloat y0, dfloat z0, dfloat dx, dfloat dy, dfloat dz, - hlong *EToV, dfloat *EX, dfloat *EY, dfloat *EZ, - hlong *elementInfo, int type, dlong &e); - -void meshHex3D::SetupPmlBox(){ - - dim = 3; - Nverts = 8; // number of vertices per element - Nfaces = 6; - NfaceVertices = 4; - - // vertices on each face - int _faceVertices[6][4] = - {{0,1,2,3},{0,1,5,4},{1,2,6,5},{2,3,7,6},{3,0,4,7},{4,5,6,7}}; + memory EToV, memory EX, memory EY, memory EZ, + memory elementInfo, int type, dlong &e); - faceVertices = - (int*) calloc(NfaceVertices*Nfaces, sizeof(int)); - - memcpy(faceVertices, _faceVertices[0], NfaceVertices*Nfaces*sizeof(int)); +void mesh_t::SetupPmlBoxHex3D(){ // find a factorization size = size_x*size_y*size_z such that // size_x>=size_y>=size_z are all 'close' to one another int size_x, size_y, size_z; - factor3(size, size_x, size_y, size_z); + Factor3(size, size_x, size_y, size_z); - //find our coordinates in the MPI grid such that - // rank = rank_x + rank_y*size_x + rank_z*size_x*size_y - int rank_z = rank/(size_x*size_y); - int rank_y = (rank-rank_z*size_x*size_y)/size_x; - int rank_x = rank % size_x; + //determine (x,y,z) rank coordinates for this processes + int rank_x=-1, rank_y=-1, rank_z=-1; + RankDecomp3(size_x, size_y, size_z, + rank_x, rank_y, rank_z, + rank); //get global size from settings dlong NX, NY, NZ; @@ -90,8 +77,8 @@ void meshHex3D::SetupPmlBox(){ settings.getSetting("BOX BOUNDARY FLAG", boundaryFlag); const int periodicFlag = (boundaryFlag == -1) ? 
1 : 0; - if (periodicFlag) - LIBP_ABORT(string("Periodic boundary unsupported for PMLBOX mesh.")) + LIBP_ABORT("Periodic boundary unsupported for PMLBOX mesh.", + periodicFlag); //local grid physical sizes dfloat DIMX, DIMY, DIMZ; @@ -107,9 +94,9 @@ void meshHex3D::SetupPmlBox(){ dfloat dy = DIMY/NY; dfloat dz = DIMZ/NZ; - dlong offset_x = rank_x*(NX/size_x) + mymin(rank_x, (NX % size_x)); - dlong offset_y = rank_y*(NY/size_y) + mymin(rank_y, (NY % size_y)); - dlong offset_z = rank_z*(NZ/size_z) + mymin(rank_z, (NZ % size_z)); + dlong offset_x = rank_x*(NX/size_x) + std::min(rank_x, (NX % size_x)); + dlong offset_y = rank_y*(NY/size_y) + std::min(rank_y, (NY % size_y)); + dlong offset_z = rank_z*(NZ/size_z) + std::min(rank_z, (NZ % size_z)); //local grid physical sizes dfloat dimx = nx*dx; @@ -183,12 +170,12 @@ void meshHex3D::SetupPmlBox(){ if (rank_x==size_x-1 && rank_y==size_y-1 && rank_z==size_z-1) Nelements+=pmlNx*pmlNy*pmlNz; - EToV = (hlong*) calloc(Nelements*Nverts, sizeof(hlong)); - EX = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat)); - EY = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat)); - EZ = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat)); + EToV.malloc(Nelements*Nverts); + EX.malloc(Nelements*Nverts); + EY.malloc(Nelements*Nverts); + EZ.malloc(Nelements*Nverts); - elementInfo = (hlong*) calloc(Nelements, sizeof(hlong)); + elementInfo.malloc(Nelements); dlong e = 0; for(int k=0;k EToV, memory EX, memory EY, memory EZ, + memory elementInfo, int type, dlong &e) { const hlong i1 = (i0+1)%NnX; const hlong j1 = (j0+1)%NnY; @@ -946,9 +932,9 @@ static void addHex(hlong i0, hlong j0, hlong k0, hlong NnX, hlong NnY, hlong NnZ EToV[e*Nverts+6] = i1 + j1*NnX + k1*NnX*NnY; EToV[e*Nverts+7] = i0 + j1*NnX + k1*NnX*NnY; - dfloat *ex = EX+e*Nverts; - dfloat *ey = EY+e*Nverts; - dfloat *ez = EZ+e*Nverts; + dfloat *ex = EX.ptr()+e*Nverts; + dfloat *ey = EY.ptr()+e*Nverts; + dfloat *ez = EZ.ptr()+e*Nverts; ex[0] = x0; ey[0] = y0; ez[0] = z0; ex[1] = x0+dx; 
ey[1] = y0; ez[1] = z0; @@ -962,4 +948,6 @@ static void addHex(hlong i0, hlong j0, hlong k0, hlong NnX, hlong NnY, hlong NnZ elementInfo[e] = type; e++; -} \ No newline at end of file +} + +} //namespace libp diff --git a/libs/mesh/meshSetupPmlBoxQuad2D.cpp b/libs/mesh/meshSetupPmlBoxQuad2D.cpp index 60a3c5d9d..094de2c0d 100644 --- a/libs/mesh/meshSetupPmlBoxQuad2D.cpp +++ b/libs/mesh/meshSetupPmlBoxQuad2D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,35 +25,21 @@ SOFTWARE. */ #include "mesh.hpp" -#include "mesh/mesh2D.hpp" -#include "mesh/mesh3D.hpp" -void meshQuad3D::SetupPmlBox(){ - LIBP_ABORT(string("PMLBOX mesh not currently supprted for Quad3D meshes.")) -} - -void meshQuad2D::SetupPmlBox(){ - - dim = 2; - Nverts = 4; // number of vertices per element - Nfaces = 4; - NfaceVertices = 2; +namespace libp { - // vertices on each face - int faceVertices_[4][2] = {{0,1},{1,2},{2,3},{3,0}}; - - faceVertices = (int*) calloc(NfaceVertices*Nfaces, sizeof(int)); - memcpy(faceVertices, faceVertices_[0], NfaceVertices*Nfaces*sizeof(int)); +void mesh_t::SetupPmlBoxQuad2D(){ // find a factorization size = size_x*size_y such that - // size_x>=size_y and are all 'close' to one another + // size_x>=size_y and are 'close' to one another int size_x, size_y; - factor2(size, size_x, size_y); + Factor2(size, size_x, size_y); - //find our coordinates in the MPI grid such that - // rank = rank_x + rank_y*size_x - int rank_y = rank / size_x; - int rank_x = rank % size_x; + //determine (x,y) rank coordinates for this processes + int rank_x=-1, rank_y=-1; + RankDecomp2(size_x, size_y, + rank_x, rank_y, + rank); //get global size from settings dlong NX, NY; @@ -81,8 +67,8 @@ void 
meshQuad2D::SetupPmlBox(){ settings.getSetting("BOX BOUNDARY FLAG", boundaryFlag); const int periodicFlag = (boundaryFlag == -1) ? 1 : 0; - if (periodicFlag) - LIBP_ABORT(string("Periodic boundary unsupported for PMLBOX mesh.")) + LIBP_ABORT("Periodic boundary unsupported for PMLBOX mesh.", + periodicFlag); //local grid physical sizes dfloat DIMX, DIMY; @@ -96,8 +82,8 @@ void meshQuad2D::SetupPmlBox(){ dfloat dx = DIMX/NX; dfloat dy = DIMY/NY; - dlong offset_x = rank_x*(NX/size_x) + mymin(rank_x, (NX % size_x)); - dlong offset_y = rank_y*(NY/size_y) + mymin(rank_y, (NY % size_y)); + dlong offset_x = rank_x*(NX/size_x) + std::min(rank_x, (NX % size_x)); + dlong offset_y = rank_y*(NY/size_y) + std::min(rank_y, (NY % size_y)); //local grid physical sizes dfloat dimx = nx*dx; @@ -143,11 +129,11 @@ void meshQuad2D::SetupPmlBox(){ if (rank_x==0 && rank_y==size_y-1) Nelements+=pmlNx*pmlNy; if (rank_x==size_x-1 && rank_y==size_y-1) Nelements+=pmlNx*pmlNy; - EToV = (hlong*) calloc(Nelements*Nverts, sizeof(hlong)); - EX = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat)); - EY = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat)); + EToV.malloc(Nelements*Nverts); + EX.malloc(Nelements*Nverts); + EY.malloc(Nelements*Nverts); - elementInfo = (hlong*) calloc(Nelements, sizeof(hlong)); + elementInfo.malloc(Nelements); dlong e = 0; @@ -168,8 +154,8 @@ void meshQuad2D::SetupPmlBox(){ dfloat x0 = X0 + dx*i; dfloat y0 = Y0 + dy*j; - dfloat *ex = EX+e*Nverts; - dfloat *ey = EY+e*Nverts; + dfloat *ex = EX.ptr()+e*Nverts; + dfloat *ey = EY.ptr()+e*Nverts; ex[0] = x0; ey[0] = y0; ex[1] = x0+dx; ey[1] = y0; @@ -199,8 +185,8 @@ void meshQuad2D::SetupPmlBox(){ dfloat x0 = X0-pmlWidthx + pmldx*i; dfloat y0 = Y0 + dy*j; - dfloat *ex = EX+e*Nverts; - dfloat *ey = EY+e*Nverts; + dfloat *ex = EX.ptr()+e*Nverts; + dfloat *ey = EY.ptr()+e*Nverts; ex[0] = x0; ey[0] = y0; ex[1] = x0+pmldx; ey[1] = y0; @@ -231,8 +217,8 @@ void meshQuad2D::SetupPmlBox(){ dfloat x0 = X0 + dimx + pmldx*i; dfloat y0 = 
Y0 + dy*j; - dfloat *ex = EX+e*Nverts; - dfloat *ey = EY+e*Nverts; + dfloat *ex = EX.ptr()+e*Nverts; + dfloat *ey = EY.ptr()+e*Nverts; ex[0] = x0; ey[0] = y0; ex[1] = x0+pmldx; ey[1] = y0; @@ -263,8 +249,8 @@ void meshQuad2D::SetupPmlBox(){ dfloat x0 = X0 + dx*i; dfloat y0 = Y0-pmlWidthy + pmldy*j; - dfloat *ex = EX+e*Nverts; - dfloat *ey = EY+e*Nverts; + dfloat *ex = EX.ptr()+e*Nverts; + dfloat *ey = EY.ptr()+e*Nverts; ex[0] = x0; ey[0] = y0; ex[1] = x0+dx; ey[1] = y0; @@ -295,8 +281,8 @@ void meshQuad2D::SetupPmlBox(){ dfloat x0 = X0 + dx*i; dfloat y0 = Y0 + dimy + pmldy*j; - dfloat *ex = EX+e*Nverts; - dfloat *ey = EY+e*Nverts; + dfloat *ex = EX.ptr()+e*Nverts; + dfloat *ey = EY.ptr()+e*Nverts; ex[0] = x0; ey[0] = y0; ex[1] = x0+dx; ey[1] = y0; @@ -327,8 +313,8 @@ void meshQuad2D::SetupPmlBox(){ dfloat x0 = X0-pmlWidthx + pmldx*i; dfloat y0 = Y0-pmlWidthy + pmldy*j; - dfloat *ex = EX+e*Nverts; - dfloat *ey = EY+e*Nverts; + dfloat *ex = EX.ptr()+e*Nverts; + dfloat *ey = EY.ptr()+e*Nverts; ex[0] = x0; ey[0] = y0; ex[1] = x0+pmldx; ey[1] = y0; @@ -359,8 +345,8 @@ void meshQuad2D::SetupPmlBox(){ dfloat x0 = X0+dimx + pmldx*i; dfloat y0 = Y0-pmlWidthy + pmldy*j; - dfloat *ex = EX+e*Nverts; - dfloat *ey = EY+e*Nverts; + dfloat *ex = EX.ptr()+e*Nverts; + dfloat *ey = EY.ptr()+e*Nverts; ex[0] = x0; ey[0] = y0; ex[1] = x0+pmldx; ey[1] = y0; @@ -391,8 +377,8 @@ void meshQuad2D::SetupPmlBox(){ dfloat x0 = X0-pmlWidthx + pmldx*i; dfloat y0 = Y0+dimy + pmldy*j; - dfloat *ex = EX+e*Nverts; - dfloat *ey = EY+e*Nverts; + dfloat *ex = EX.ptr()+e*Nverts; + dfloat *ey = EY.ptr()+e*Nverts; ex[0] = x0; ey[0] = y0; ex[1] = x0+pmldx; ey[1] = y0; @@ -423,8 +409,8 @@ void meshQuad2D::SetupPmlBox(){ dfloat x0 = X0+dimx + pmldx*i; dfloat y0 = Y0+dimy + pmldy*j; - dfloat *ex = EX+e*Nverts; - dfloat *ey = EY+e*Nverts; + dfloat *ex = EX.ptr()+e*Nverts; + dfloat *ey = EY.ptr()+e*Nverts; ex[0] = x0; ey[0] = y0; ex[1] = x0+pmldx; ey[1] = y0; @@ -439,7 +425,7 @@ void meshQuad2D::SetupPmlBox(){ 
if (boundaryFlag != -1) { //-1 reserved for periodic case NboundaryFaces = 2*NX + 2*NY; - boundaryInfo = (hlong*) calloc(NboundaryFaces*(NfaceVertices+1), sizeof(hlong)); + boundaryInfo.malloc(NboundaryFaces*(NfaceVertices+1)); hlong bcnt = 0; @@ -476,7 +462,8 @@ void meshQuad2D::SetupPmlBox(){ } } else { - NboundaryFaces = 0; - boundaryInfo = NULL; // no boundaries + NboundaryFaces = 0; // no boundaries } } + +} //namespace libp diff --git a/libs/mesh/meshSetupPmlBoxTet3D.cpp b/libs/mesh/meshSetupPmlBoxTet3D.cpp index 093631167..aaf5863ff 100644 --- a/libs/mesh/meshSetupPmlBoxTet3D.cpp +++ b/libs/mesh/meshSetupPmlBoxTet3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,36 +25,26 @@ SOFTWARE. 
*/ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" + +namespace libp { static void addTets(hlong i0, hlong j0, hlong k0, hlong NnX, hlong NnY, hlong NnZ, dfloat x0, dfloat y0, dfloat z0, dfloat dx, dfloat dy, dfloat dz, - hlong *EToV, dfloat *EX, dfloat *EY, dfloat *EZ, - hlong *elementInfo, int type, dlong &e); - -void meshTet3D::SetupPmlBox(){ - - dim = 3; - Nverts = 4; // number of vertices per element - Nfaces = 4; - NfaceVertices = 3; + memory EToV, memory EX, memory EY, memory EZ, + memory elementInfo, int type, dlong &e); - // vertices on each face - int faceVertices_[4][3] = {{0,1,2},{0,1,3},{1,2,3},{2,0,3}}; - - faceVertices = (int*) calloc(NfaceVertices*Nfaces, sizeof(int)); - memcpy(faceVertices, faceVertices_[0], 12*sizeof(int)); +void mesh_t::SetupPmlBoxTet3D(){ // find a factorization size = size_x*size_y*size_z such that // size_x>=size_y>=size_z are all 'close' to one another int size_x, size_y, size_z; - factor3(size, size_x, size_y, size_z); + Factor3(size, size_x, size_y, size_z); - //find our coordinates in the MPI grid such that - // rank = rank_x + rank_y*size_x + rank_z*size_x*size_y - int rank_z = rank/(size_x*size_y); - int rank_y = (rank-rank_z*size_x*size_y)/size_x; - int rank_x = rank % size_x; + //determine (x,y,z) rank coordinates for this processes + int rank_x=-1, rank_y=-1, rank_z=-1; + RankDecomp3(size_x, size_y, size_z, + rank_x, rank_y, rank_z, + rank); //get global size from settings dlong NX, NY, NZ; @@ -87,8 +77,8 @@ void meshTet3D::SetupPmlBox(){ settings.getSetting("BOX BOUNDARY FLAG", boundaryFlag); const int periodicFlag = (boundaryFlag == -1) ? 
1 : 0; - if (periodicFlag) - LIBP_ABORT(string("Periodic boundary unsupported for PMLBOX mesh.")) + LIBP_ABORT("Periodic boundary unsupported for PMLBOX mesh.", + periodicFlag); //local grid physical sizes dfloat DIMX, DIMY, DIMZ; @@ -104,9 +94,9 @@ void meshTet3D::SetupPmlBox(){ dfloat dy = DIMY/NY; dfloat dz = DIMZ/NZ; - dlong offset_x = rank_x*(NX/size_x) + mymin(rank_x, (NX % size_x)); - dlong offset_y = rank_y*(NY/size_y) + mymin(rank_y, (NY % size_y)); - dlong offset_z = rank_z*(NZ/size_z) + mymin(rank_z, (NZ % size_z)); + dlong offset_x = rank_x*(NX/size_x) + std::min(rank_x, (NX % size_x)); + dlong offset_y = rank_y*(NY/size_y) + std::min(rank_y, (NY % size_y)); + dlong offset_z = rank_z*(NZ/size_z) + std::min(rank_z, (NZ % size_z)); //local grid physical sizes dfloat dimx = nx*dx; @@ -180,12 +170,12 @@ void meshTet3D::SetupPmlBox(){ if (rank_x==0 && rank_y==size_y-1 && rank_z==size_z-1) Nelements+=6*pmlNx*pmlNy*pmlNz; if (rank_x==size_x-1 && rank_y==size_y-1 && rank_z==size_z-1) Nelements+=6*pmlNx*pmlNy*pmlNz; - EToV = (hlong*) calloc(Nelements*Nverts, sizeof(hlong)); - EX = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat)); - EY = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat)); - EZ = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat)); + EToV.malloc(Nelements*Nverts); + EX.malloc(Nelements*Nverts); + EY.malloc(Nelements*Nverts); + EZ.malloc(Nelements*Nverts); - elementInfo = (hlong*) calloc(Nelements, sizeof(hlong)); + elementInfo.malloc(Nelements); dlong e = 0; for(int k=0;k EToV, memory EX, memory EY, memory EZ, + memory elementInfo, int type, dlong &e) { const hlong i1 = (i0+1)%NnX; const hlong j1 = (j0+1)%NnY; @@ -1038,4 +1027,6 @@ static void addTets(hlong i0, hlong j0, hlong k0, hlong NnX, hlong NnY, hlong Nn elementInfo[e] = type; e++; -} \ No newline at end of file +} + +} //namespace libp diff --git a/libs/mesh/meshSetupPmlBoxTri2D.cpp b/libs/mesh/meshSetupPmlBoxTri2D.cpp index a0db5f11e..0ab065d25 100644 --- 
a/libs/mesh/meshSetupPmlBoxTri2D.cpp +++ b/libs/mesh/meshSetupPmlBoxTri2D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,35 +25,21 @@ SOFTWARE. */ #include "mesh.hpp" -#include "mesh/mesh2D.hpp" -#include "mesh/mesh3D.hpp" -void meshTri3D::SetupPmlBox(){ - LIBP_ABORT(string("PMLBOX mesh not currently supprted for Tri3D meshes.")) -} - -void meshTri2D::SetupPmlBox(){ - - dim = 2; - Nverts = 3; // number of vertices per element - Nfaces = 3; - NfaceVertices = 2; +namespace libp { - // vertices on each face - int faceVertices_[4][2] = {{0,1},{1,2},{2,0}}; - - faceVertices = (int*) calloc(NfaceVertices*Nfaces, sizeof(int)); - memcpy(faceVertices, faceVertices_[0], NfaceVertices*Nfaces*sizeof(int)); +void mesh_t::SetupPmlBoxTri2D(){ // find a factorization size = size_x*size_y such that - // size_x>=size_y and are all 'close' to one another + // size_x>=size_y and are 'close' to one another int size_x, size_y; - factor2(size, size_x, size_y); + Factor2(size, size_x, size_y); - //find our coordinates in the MPI grid such that - // rank = rank_x + rank_y*size_x - int rank_y = rank / size_x; - int rank_x = rank % size_x; + //determine (x,y) rank coordinates for this processes + int rank_x=-1, rank_y=-1; + RankDecomp2(size_x, size_y, + rank_x, rank_y, + rank); //get global size from settings dlong NX, NY; @@ -81,8 +67,8 @@ void meshTri2D::SetupPmlBox(){ settings.getSetting("BOX BOUNDARY FLAG", boundaryFlag); const int periodicFlag = (boundaryFlag == -1) ? 
1 : 0; - if (periodicFlag) - LIBP_ABORT(string("Periodic boundary unsupported for PMLBOX mesh.")) + LIBP_ABORT("Periodic boundary unsupported for PMLBOX mesh.", + periodicFlag); //local grid physical sizes dfloat DIMX, DIMY; @@ -96,8 +82,8 @@ void meshTri2D::SetupPmlBox(){ dfloat dx = DIMX/NX; dfloat dy = DIMY/NY; - dlong offset_x = rank_x*(NX/size_x) + mymin(rank_x, (NX % size_x)); - dlong offset_y = rank_y*(NY/size_y) + mymin(rank_y, (NY % size_y)); + dlong offset_x = rank_x*(NX/size_x) + std::min(rank_x, (NX % size_x)); + dlong offset_y = rank_y*(NY/size_y) + std::min(rank_y, (NY % size_y)); //local grid physical sizes dfloat dimx = nx*dx; @@ -145,11 +131,11 @@ void meshTri2D::SetupPmlBox(){ if (rank_x==size_x-1 && rank_y==size_y-1) Nelements+=2*pmlNx*pmlNy; - EToV = (hlong*) calloc(Nelements*Nverts, sizeof(hlong)); - EX = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat)); - EY = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat)); + EToV.malloc(Nelements*Nverts); + EX.malloc(Nelements*Nverts); + EY.malloc(Nelements*Nverts); - elementInfo = (hlong*) calloc(Nelements, sizeof(hlong)); + elementInfo.malloc(Nelements); dlong e = 0; @@ -495,7 +481,7 @@ void meshTri2D::SetupPmlBox(){ if (boundaryFlag != -1) { //-1 reserved for periodic case NboundaryFaces = 2*NX + 2*NY; - boundaryInfo = (hlong*) calloc(NboundaryFaces*(NfaceVertices+1), sizeof(hlong)); + boundaryInfo.malloc(NboundaryFaces*(NfaceVertices+1)); hlong bcnt = 0; @@ -532,7 +518,8 @@ void meshTri2D::SetupPmlBox(){ } } else { - NboundaryFaces = 0; - boundaryInfo = NULL; // no boundaries + NboundaryFaces = 0; // no boundaries } } + +} //namespace libp diff --git a/libs/mesh/meshSetupRingPatch.cpp b/libs/mesh/meshSetupRingPatch.cpp index d3dcc966b..3d6c46b13 100644 --- a/libs/mesh/meshSetupRingPatch.cpp +++ b/libs/mesh/meshSetupRingPatch.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse 
Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,195 +25,101 @@ SOFTWARE. */ #include "mesh.hpp" -#include "mesh/mesh2D.hpp" -#include "mesh/mesh3D.hpp" + +namespace libp { //build a new mesh object consisting of the orignal mesh with an // 1-element overlap with neighboring meshes -mesh_t* mesh_t::SetupRingPatch(){ +mesh_t mesh_t::SetupRingPatch(){ //setup the 1-ring halo exchange HaloRingSetup(); + /*Copy underlying mesh object*/ + mesh_t mesh = *this; + //just reuse the current mesh if there are no neighbors - if (size==1) return this; + if (size==1) return mesh; // single process communicator for new mesh - MPI_Comm* splitComm = new MPI_Comm; - MPI_Comm_split(comm, rank, rank, splitComm); - - mesh_t *mesh=NULL; - switch(elementType){ - case TRIANGLES: - if(dim==2) - mesh = new meshTri2D(platform, settings, *splitComm); - else - mesh = new meshTri3D(platform, settings, *splitComm); - break; - case QUADRILATERALS: - if(dim==2) - mesh = new meshQuad2D(platform, settings, *splitComm); - else - mesh = new meshQuad3D(platform, settings, *splitComm); - break; - case TETRAHEDRA: - mesh = new meshTet3D(platform, settings, *splitComm); - break; - case HEXAHEDRA: - mesh = new meshHex3D(platform, settings, *splitComm); - break; - } - - //shallow copy of base mesh geometry - mesh->dim = dim; - mesh->Nverts = Nverts; - mesh->Nfaces = Nfaces; - mesh->NfaceVertices = NfaceVertices; - mesh->faceVertices = faceVertices; + mesh.comm = comm.Split(rank, rank); + mesh.rank = mesh.comm.rank(); + mesh.size = mesh.comm.size(); - mesh->elementType = elementType; - - mesh->Nnodes = Nnodes; //not really correct, but unused - mesh->Nelements = Nelements+totalRingElements; - mesh->NelementsGlobal = Nelements+totalRingElements; + mesh.Nelements = Nelements+totalRingElements; + mesh.NelementsGlobal = Nelements+totalRingElements; //populate mesh vertices - mesh->EX 
= (dfloat*) calloc(mesh->Nelements*Nverts, sizeof(dfloat)); - mesh->EY = (dfloat*) calloc(mesh->Nelements*Nverts, sizeof(dfloat)); + mesh.EX.malloc(mesh.Nelements*Nverts); + mesh.EY.malloc(mesh.Nelements*Nverts); if(dim==3) - mesh->EZ = (dfloat*) calloc(mesh->Nelements*Nverts, sizeof(dfloat)); + mesh.EZ.malloc(mesh.Nelements*Nverts); - memcpy(mesh->EX, EX, Nelements*Nverts*sizeof(dfloat)); - memcpy(mesh->EY, EY, Nelements*Nverts*sizeof(dfloat)); + mesh.EX.copyFrom(EX, Nelements*Nverts); + mesh.EY.copyFrom(EY, Nelements*Nverts); if(dim==3) - memcpy(mesh->EZ, EZ, Nelements*Nverts*sizeof(dfloat)); + mesh.EZ.copyFrom(EZ, Nelements*Nverts); - ringHalo->Exchange(mesh->EX, Nverts, ogs_dfloat); - ringHalo->Exchange(mesh->EY, Nverts, ogs_dfloat); + ringHalo.Exchange(mesh.EX, Nverts); + ringHalo.Exchange(mesh.EY, Nverts); if(dim==3) - ringHalo->Exchange(mesh->EZ, Nverts, ogs_dfloat); + ringHalo.Exchange(mesh.EZ, Nverts); - mesh->EToV = (hlong*) calloc(mesh->Nelements*Nverts, sizeof(hlong)); - memcpy(mesh->EToV, EToV, Nelements*Nverts*sizeof(hlong)); - ringHalo->Exchange(mesh->EToV, Nverts, ogs_hlong); + mesh.EToV.malloc(mesh.Nelements*Nverts); + mesh.EToV.copyFrom(EToV, Nelements*Nverts); + ringHalo.Exchange(mesh.EToV, Nverts); - mesh->elementInfo = (hlong*) calloc(mesh->Nelements, sizeof(hlong)); - memcpy(mesh->elementInfo, elementInfo, Nelements*sizeof(hlong)); - ringHalo->Exchange(mesh->elementInfo, 1, ogs_hlong); + mesh.elementInfo.malloc(mesh.Nelements); + mesh.elementInfo.copyFrom(elementInfo, Nelements); + ringHalo.Exchange(mesh.elementInfo, 1); // connect elements using parallel sort - mesh->ParallelConnect(); + mesh.Connect(); - mesh->NboundaryFaces = NboundaryFaces; - mesh->boundaryInfo = boundaryInfo; + mesh.NboundaryFaces = NboundaryFaces; + mesh.boundaryInfo = boundaryInfo; // element-to-boundary condition type - mesh->EToB = (int*) calloc(mesh->Nelements*Nfaces, sizeof(int)); - memcpy(mesh->EToB, EToB, Nelements*Nfaces*sizeof(int)); - 
ringHalo->Exchange(mesh->EToB, Nfaces, ogs_int); + mesh.EToB.malloc(mesh.Nelements*Nfaces); + mesh.EToB.copyFrom(EToB, Nelements*Nfaces); + ringHalo.Exchange(mesh.EToB, Nfaces); // correct bcs (replaces unconnected faces with Dirichlet) - for(dlong e=0;eNelements;++e){ + for(dlong e=0;eEToE[id]==-1 && mesh->EToB[id]==-1){ - mesh->EToB[id] = 1; // hack to 1 assume Dirichlet - mesh->EToE[id] = e; // hack to 1 assume Dirichlet + if(mesh.EToE[id]==-1 && mesh.EToB[id]==-1){ + mesh.EToB[id] = 1; // hack to 1 assume Dirichlet + mesh.EToE[id] = e; // hack to 1 assume Dirichlet } } } - //Reference Nodes - mesh->N = N; - mesh->Np = Np; - mesh->Nq = Nq; - mesh->Nfp = Nfp; - - mesh->vertexNodes = vertexNodes; - - mesh->r = r; - mesh->s = s; - mesh->t = t; - - mesh->w = w; - - mesh->D = D; - mesh->Dr = Dr; - mesh->Ds = Ds; - mesh->Dt = Dt; - mesh->S = S; - mesh->Srr = Srr; - mesh->Srs = Srs; - mesh->Srt = Srt; - mesh->Sss = Sss; - mesh->Sst = Sst; - mesh->Stt = Stt; - mesh->MM = MM; - mesh->invMM = invMM; - mesh->sM = sM; - mesh->faceNodes = faceNodes; - mesh->LIFT = LIFT; - - mesh->plotNp = plotNp; - mesh->plotNelements = plotNelements; - mesh->plotNverts = plotNverts; - mesh->plotR = plotR; - mesh->plotS = plotS; - mesh->plotT = plotT; - mesh->plotInterp = plotInterp; - mesh->plotEToV = plotEToV; - - mesh->cubNp = cubNp; - mesh->cubNq = cubNq; - mesh->cubNfp = cubNfp; - mesh->cubr = cubr; - mesh->cubs = cubs; - mesh->cubt = cubt; - mesh->cubw = cubw; - mesh->cubInterp = cubInterp; - mesh->cubProject = cubProject; - mesh->cubD = cubD; - mesh->cubPDT = cubPDT; - mesh->cubPDrT = cubPDrT; - mesh->cubPDsT = cubPDsT; - mesh->cubPDtT = cubPDtT; - mesh->intNfp = intNfp; - mesh->intInterp = intInterp; - mesh->intLIFT = intLIFT; - - mesh->NpFEM = NpFEM; - mesh->NelFEM = NelFEM; - mesh->rFEM = rFEM; - mesh->sFEM = sFEM; - mesh->tFEM = tFEM; - mesh->SEMFEMInterp = SEMFEMInterp; - mesh->FEMEToV = FEMEToV; - - - mesh->ringHalo = NULL; - //Halo - mesh->HaloSetup(); + mesh.HaloSetup(); - // 
compute physical (x,y) locations of the element nodes - mesh->PhysicalNodes(); - - // compute geometric factors - mesh->GeometricFactors(); + // connect face vertices + mesh.ConnectFaceVertices(); // connect face nodes (find trace indices) - mesh->ConnectFaceNodes(); + mesh.ConnectFaceNodes(); - // compute surface geofacs - mesh->SurfaceGeometricFactors(); + // make global indexing + mesh.ConnectNodes(); - // make a global indexing - mesh->ParallelConnectNodes(); + // compute physical (x,y) locations of the element nodes + mesh.PhysicalNodes(); - // make an ogs operator and label local/global gather elements - mesh->ParallelGatherScatterSetup(); + // compute geometric factors + mesh.GeometricFactors(); - mesh->OccaSetup(); + // compute surface geofacs + mesh.SurfaceGeometricFactors(); + + // label local/global gather elements + mesh.GatherScatterSetup(); return mesh; } + +} //namespace libp diff --git a/libs/mesh/meshSetupSEMFEM.cpp b/libs/mesh/meshSetupSEMFEM.cpp index 1d0f3b438..f325c370d 100644 --- a/libs/mesh/meshSetupSEMFEM.cpp +++ b/libs/mesh/meshSetupSEMFEM.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,170 +25,95 @@ SOFTWARE. 
*/ #include "mesh.hpp" -#include "mesh/mesh2D.hpp" -#include "mesh/mesh3D.hpp" -mesh_t* mesh_t::SetupSEMFEM(hlong **globalIds_, int *Nfp_, int **faceNodes_){ +namespace libp { + +mesh_t mesh_t::SetupSEMFEM(memory& globalIds_, + memory& mapB_){ //partially assembled fem mesh (result of projecting sem element to larger space) - mesh_t *pmesh=NULL; - switch(elementType){ - //quads and hexes reuse the SEM ndoes for the FEM problem - case QUADRILATERALS: - pmesh = this; break; - case TETRAHEDRA: - pmesh = this; break; - case HEXAHEDRA: - pmesh = this; break; - case TRIANGLES: - if(dim==2) - pmesh = new meshTri2D(platform, settings, comm); - else - pmesh = new meshTri3D(platform, settings, comm); - break; - } + mesh_t pmesh=*this; //setup the intermediate mesh for tris and tets - if (elementType==TRIANGLES) { - pmesh->dim = dim; - pmesh->elementType = elementType; - pmesh->Nverts = Nverts; - pmesh->Nfaces = Nfaces; - pmesh->NfaceVertices = NfaceVertices; - pmesh->faceVertices = faceVertices; - + if (elementType==Mesh::TRIANGLES) { /* SEMFEM data */ - SEMFEMNodesTri2D(N, &NpFEM, &rFEM, &sFEM); - SEMFEMEToVTri2D(N, &NelFEM, &FEMEToV); + SEMFEMNodesTri2D(N, NpFEM, rFEM, sFEM); + SEMFEMEToVTri2D(N, NelFEM, FEMEToV); - SEMFEMInterp = (dfloat*) calloc(NpFEM*Np, sizeof(dfloat)); - SEMFEMInterpMatrixTri2D(N, Np, r, s, NpFEM, rFEM, sFEM, SEMFEMInterp); + SEMFEMInterpMatrixTri2D(N, r, s, rFEM, sFEM, SEMFEMInterp); //set semfem nodes as the grid points - pmesh->Np = NpFEM; - pmesh->r = rFEM; - pmesh->s = sFEM; + pmesh.Np = NpFEM; + pmesh.r = rFEM; + pmesh.s = sFEM; //count number of face nodes in the semfem element dfloat NODETOL = 1e-6; - pmesh->Nfp=0; - for (int n=0;nNp;n++) - if (fabs(pmesh->s[n]+1)Nfp++; + pmesh.Nfp=0; + for (int n=0;nfaceNodes = (int *) calloc(Nfaces*pmesh->Nfp,sizeof(int)); + pmesh.faceNodes.malloc(Nfaces*pmesh.Nfp); int f0=0, f1=0, f2=0; - for (int n=0;nNp;n++) { - if (fabs(pmesh->s[n]+1)faceNodes[0*pmesh->Nfp+f0++] = n; - if 
(fabs(pmesh->r[n]+pmesh->s[n])faceNodes[1*pmesh->Nfp+f1++] = n; - if (fabs(pmesh->r[n]+1)faceNodes[2*pmesh->Nfp+f2++] = n; + for (int n=0;nvertexNodes = (int*) calloc(Nverts, sizeof(int)); - for(int n=0;nNp;++n){ - if( (pmesh->r[n]+1)*(pmesh->r[n]+1)+(pmesh->s[n]+1)*(pmesh->s[n]+1)vertexNodes[0] = n; - if( (pmesh->r[n]-1)*(pmesh->r[n]-1)+(pmesh->s[n]+1)*(pmesh->s[n]+1)vertexNodes[1] = n; - if( (pmesh->r[n]+1)*(pmesh->r[n]+1)+(pmesh->s[n]-1)*(pmesh->s[n]-1)vertexNodes[2] = n; + pmesh.vertexNodes.malloc(Nverts); + for(int n=0;nNnodes = Nnodes; - pmesh->EX = EX; // coordinates of vertices for each element - pmesh->EY = EY; - pmesh->EZ = EZ; - - pmesh->Nelements = Nelements; - pmesh->NelementsGlobal = NelementsGlobal; - pmesh->EToV = EToV; // element-to-vertex connectivity - pmesh->EToE = EToE; // element-to-element connectivity - pmesh->EToF = EToF; // element-to-(local)face connectivity - pmesh->EToP = EToP; // element-to-partition/process connectivity - pmesh->EToB = EToB; // element-to-boundary condition type - - pmesh->elementInfo = elementInfo; - - pmesh->NboundaryFaces = NboundaryFaces; - pmesh->boundaryInfo = boundaryInfo; - - //use existing halo - pmesh->halo = halo; - pmesh->NinternalElements = NinternalElements; - pmesh->NhaloElements = NhaloElements; - pmesh->totalHaloPairs = totalHaloPairs; - pmesh->internalElementIds = internalElementIds; - pmesh->haloElementIds = haloElementIds; - // compute physical (x,y) locations FEM vertices - pmesh->PhysicalNodes(); + pmesh.PhysicalNodes(); // connect face nodes (find trace indices) - pmesh->ConnectFaceNodes(); + pmesh.ConnectFaceNodes(); // make a global indexing - pmesh->ParallelConnectNodes(); - //pmesh->globalIds is now populated - } - - //need to return this data - *globalIds_ = pmesh->globalIds; - *Nfp_ = pmesh->Nfp; - *faceNodes_ = pmesh->faceNodes; - - //now build the full degree 1 fem mesh - mesh_t *femMesh=NULL; - switch(elementType){ - case TRIANGLES: - if(dim==2) - femMesh = new meshTri2D(platform, 
settings, comm); - else - femMesh = new meshTri3D(platform, settings, comm); - break; - case QUADRILATERALS: - if(dim==2) - femMesh = new meshQuad2D(platform, settings, comm); - else - femMesh = new meshQuad3D(platform, settings, comm); + pmesh.ConnectNodes(); + //pmesh.globalIds is now populated + //pmesh.mapB is now populated + } else if (elementType==Mesh::QUADRILATERALS) { NpFEM = Np; NelFEM = N*N; - FEMEToV = (int*) malloc(NelFEM*Nverts*sizeof(int)); SEMFEMEToVQuad2D(N, FEMEToV); - break; - case TETRAHEDRA: - femMesh = new meshTet3D(platform, settings, comm); + } else if (elementType==Mesh::TETRAHEDRA){ NpFEM = Np; NelFEM = N*N*N; - FEMEToV = (int*) malloc(NelFEM*Nverts*sizeof(int)); SEMFEMEToVTet3D(N, FEMEToV); - break; - case HEXAHEDRA: - femMesh = new meshHex3D(platform, settings, comm); + } else { //Mesh::HEXAHEDRA NpFEM = Np; NelFEM = N*N*N; - FEMEToV = (int*) malloc(NelFEM*Nverts*sizeof(int)); SEMFEMEToVHex3D(N, FEMEToV); - break; } - int femN = 1; //degree of fem approximation - femMesh->dim = dim; - femMesh->elementType = elementType; - femMesh->Nverts = Nverts; - femMesh->Nfaces = Nfaces; - femMesh->NfaceVertices = NfaceVertices; - femMesh->faceVertices = faceVertices; + //need to return this data + globalIds_ = pmesh.globalIds; + mapB_ = pmesh.mapB; + + //now build the full degree 1 fem mesh + mesh_t femMesh=*this; + + femMesh.N = 1; //degree of fem approximation /* allocate space for node coordinates */ - femMesh->Nelements = NelFEM*Nelements; - dlong NFEMverts = femMesh->Nelements*Nverts; - femMesh->EToV = (hlong*) calloc(NFEMverts, sizeof(hlong)); - femMesh->EX = (dfloat*) calloc(NFEMverts, sizeof(dfloat)); - femMesh->EY = (dfloat*) calloc(NFEMverts, sizeof(dfloat)); + femMesh.Nelements = NelFEM*Nelements; + dlong NFEMverts = femMesh.Nelements*Nverts; + femMesh.EToV.malloc(NFEMverts); + femMesh.EX.malloc(NFEMverts); + femMesh.EY.malloc(NFEMverts); if (dim==3) - femMesh->EZ = (dfloat*) calloc(NFEMverts, sizeof(dfloat)); + 
femMesh.EZ.malloc(NFEMverts); for(dlong e=0;eEToV[femId+i] = pmesh->globalIds[id]; + femMesh.EToV[femId+i] = pmesh.globalIds[id]; - femMesh->EX[femId+i] = pmesh->x[id]; - femMesh->EY[femId+i] = pmesh->y[id]; + femMesh.EX[femId+i] = pmesh.x[id]; + femMesh.EY[femId+i] = pmesh.y[id]; if (dim==3) - femMesh->EZ[femId+i] = pmesh->z[id]; + femMesh.EZ[femId+i] = pmesh.z[id]; } } } - // connect elements using parallel sort - femMesh->ParallelConnect(); - // load reference (r,s) element nodes - femMesh->ReferenceNodes(femN); + femMesh.ReferenceNodes(); + + // connect elements using parallel sort + femMesh.Connect(); //identify the nodes on the SEMFEM element faces - int *faceFlag = (int*) calloc(pmesh->Np*Nfaces,sizeof(int)); + memory faceFlag(pmesh.Np*Nfaces, 0); for (int f=0;fNfp;n++) { - int id = pmesh->faceNodes[f*pmesh->Nfp+n]; - faceFlag[f*pmesh->Np + id] = 1; //flag the nodes on this face + for (int n=0;nNfaces,sizeof(int)); - for (int n=0;nNfaces;n++) femFaceMap[n] = -1; + memory femFaceMap(NelFEM*femMesh.Nfaces, 0); + for (int n=0;nNfaces;f++) { + for (int f=0;fNfp;i++){ - int id = femMesh->faceNodes[f*femMesh->Nfp+i]; + for (int i=0;iNp + v]; + NvertsOnFace += faceFlag[face*pmesh.Np + v]; } - if (NvertsOnFace == femMesh->Nfp) - femFaceMap[n*femMesh->Nfaces+f] = face; //on macro face + if (NvertsOnFace == femMesh.Nfp) + femFaceMap[n*femMesh.Nfaces+f] = face; //on macro face } } } //fill the boundary flag array from the original EToB - femMesh->EToB = (int*) calloc(femMesh->Nelements*femMesh->Nfaces, sizeof(int)); + femMesh.EToB.malloc(femMesh.Nelements*femMesh.Nfaces, 0); for (dlong e=0;eNfaces;f++) { - int face = femFaceMap[n*femMesh->Nfaces+f]; + for (int f=0;f-1) { - femMesh->EToB[(e*NelFEM +n)*femMesh->Nfaces +f] = EToB[e*Nfaces + face]; + femMesh.EToB[(e*NelFEM +n)*femMesh.Nfaces +f] = EToB[e*Nfaces + face]; } } } } - free(faceFlag); - free(femFaceMap); // set up halo exchange info for MPI (do before connect face nodes) - femMesh->HaloSetup(); + 
femMesh.HaloSetup(); - // compute physical (x,y) locations of the element nodes - femMesh->PhysicalNodes(); - - // compute geometric factors - femMesh->GeometricFactors(); + // connect face vertices + femMesh.ConnectFaceVertices(); // connect face nodes (find trace indices) - // femMesh->ConnectFaceNodes(); + femMesh.ConnectFaceNodes(); - // compute surface geofacs - // femMesh->SurfaceGeometricFactors(); + // make global indexing + femMesh.ConnectNodes(); + + // compute physical (x,y) locations of the element nodes + femMesh.PhysicalNodes(); - // make a global indexing - //femMesh->ParallelConnectNodes(); + // compute geometric factors + femMesh.GeometricFactors(); - // make an ogs operator and label local/global gather elements - //femMesh->ParallelGatherScatterSetup(); + // compute surface geofacs + // femMesh.SurfaceGeometricFactors(); - //dont need to setup occa buffers for this mesh - // femMesh->OccaSetup(); + // label local/global gather elements + femMesh.GatherScatterSetup(); return femMesh; } + +} //namespace libp diff --git a/libs/mesh/meshSurfaceGeometricFactorsHex3D.cpp b/libs/mesh/meshSurfaceGeometricFactorsHex3D.cpp index eb02b7f45..616eafb1d 100644 --- a/libs/mesh/meshSurfaceGeometricFactorsHex3D.cpp +++ b/libs/mesh/meshSurfaceGeometricFactorsHex3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,72 +25,49 @@ SOFTWARE. 
*/ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" -/* -static void computeFrame(dfloat nx, dfloat ny, dfloat nz, - dfloat &tanx, dfloat &tany, dfloat &tanz, - dfloat &binx, dfloat &biny, dfloat &binz){ - - dfloat rdotn, ranx, rany, ranz; - do{ - ranx = drand48(); - rany = drand48(); - ranz = drand48(); - - dfloat magran = sqrt(ranx*ranx+rany*rany+ranz*ranz); - - ranx /= magran; - rany /= magran; - ranz /= magran; - - rdotn = nx*ranx+ny*rany+nz*ranz; - }while(fabs(rdotn)<1e-4); - - tanx = ny*ranz - nz*rany; - tany = nz*ranx - nx*ranz; - tanz = nx*rany - ny*ranx; - - dfloat magtan = sqrt(tanx*tanx+tany*tany+tanz*tanz); - - tanx /= magtan; - tany /= magtan; - tanz /= magtan; - - binx = ny*tanz - nz*tany; - biny = nz*tanx - nx*tanz; - binz = nx*tany - ny*tanx; - - dfloat magbin = sqrt(binx*binx+biny*biny+binz*binz); - - binx /= magbin; - biny /= magbin; - binz /= magbin; - - // printf("nor = %g,%g,%g; tan = %g,%g,%g; bin = %g,%g,%g\n", nx, ny, nz, tanx, tany, tanz, binx, biny, binz); -} -*/ +namespace libp { /* compute outwards facing normals, surface Jacobian, and volume Jacobian for all face nodes */ -void meshHex3D::SurfaceGeometricFactors(){ +void mesh_t::SurfaceGeometricFactorsHex3D(){ /* unified storage array for geometric factors */ - Nsgeo = 8; //17; (old) - sgeo = (dfloat*) calloc((Nelements+totalHaloPairs)* - Nsgeo*Nfp*Nfaces, - sizeof(dfloat)); - - dfloat *xre = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *xse = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *xte = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *yre = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *yse = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *yte = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *zre = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *zse = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *zte = (dfloat*) calloc(Np, sizeof(dfloat)); - - for(dlong e=0;e h((Nelements+totalHaloPairs)*Nfp*Nfaces); + + memory xre(Np); + memory xse(Np); + memory xte(Np); + memory yre(Np); 
+ memory yse(Np); + memory yte(Np); + memory zre(Np); + memory zse(Np); + memory zte(Np); + + for(dlong e=0;e 0) { //enforce a stronger penalty on boundaries + // sgeo[baseM*Nsgeo+IHID] *= 2; + // } + } } } - free(xre); free(xse); free(xte); - free(yre); free(yse); free(yte); - free(zre); free(zse); free(zte); + o_sgeo = platform.malloc(sgeo); } + +} //namespace libp diff --git a/libs/mesh/meshSurfaceGeometricFactorsQuad2D.cpp b/libs/mesh/meshSurfaceGeometricFactorsQuad2D.cpp index 44d288d4b..4c5e40bad 100644 --- a/libs/mesh/meshSurfaceGeometricFactorsQuad2D.cpp +++ b/libs/mesh/meshSurfaceGeometricFactorsQuad2D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,23 +25,42 @@ SOFTWARE. */ #include "mesh.hpp" -#include "mesh/mesh2D.hpp" + +namespace libp { /* compute outwards facing normals, surface Jacobian, and volume Jacobian for all face nodes */ -void meshQuad2D::SurfaceGeometricFactors(){ +void mesh_t::SurfaceGeometricFactorsQuad2D(){ /* unified storage array for geometric factors */ Nsgeo = 7; - sgeo = (dfloat*) calloc((Nelements+totalHaloPairs)* - Nsgeo*Nfp*Nfaces, - sizeof(dfloat)); - dfloat *xre = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *xse = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *yre = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *yse = (dfloat*) calloc(Np, sizeof(dfloat)); + NXID = 0; + NYID = 1; + SJID = 2; + IJID = 3; + IHID = 4; + WSJID = 5; + WIJID = 6; + + props["defines/" "p_Nsgeo"]= Nsgeo; + props["defines/" "p_NXID"]= NXID; + props["defines/" "p_NYID"]= NYID; + props["defines/" "p_SJID"]= SJID; + props["defines/" "p_IJID"]= IJID; + props["defines/" "p_IHID"]= IHID; + props["defines/" "p_WSJID"]= WSJID; + props["defines/" 
"p_WIJID"]= WIJID; + + sgeo.malloc(Nelements*Nsgeo*Nfp*Nfaces); - for(dlong e=0;e hinv((Nelements+totalHaloPairs)*Nfp*Nfaces); + + memory xre(Np); + memory xse(Np); + memory yre(Np); + memory yse(Np); + + for(dlong e=0;e 0) { //enforce a stronger penalty on boundaries + // sgeo[baseM*Nsgeo+IHID] *= 2; + // } + } } } + + o_sgeo = platform.malloc(sgeo); } + +} //namespace libp diff --git a/libs/mesh/meshSurfaceGeometricFactorsQuad3D.cpp b/libs/mesh/meshSurfaceGeometricFactorsQuad3D.cpp index b1816330f..b2f201e31 100644 --- a/libs/mesh/meshSurfaceGeometricFactorsQuad3D.cpp +++ b/libs/mesh/meshSurfaceGeometricFactorsQuad3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,43 +25,62 @@ SOFTWARE. 
*/ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" + +namespace libp { /* compute outwards facing normals, surface Jacobian, and volume Jacobian for all face nodes */ -void meshQuad3D::SurfaceGeometricFactors(){ +void mesh_t::SurfaceGeometricFactorsQuad3D(){ /* unified storage array for geometric factors */ - Nsgeo = 14; // fix later - sgeo = (dfloat*) calloc((Nelements+totalHaloPairs)* - Nsgeo*Nfp*Nfaces, - sizeof(dfloat)); + Nsgeo = 8; + + NXID = 0; + NYID = 1; + NZID = 2; + SJID = 3; + IJID = 4; + IHID = 5; + WSJID = 6; + WIJID = 7; + + props["defines/" "p_Nsgeo"]= Nsgeo; + props["defines/" "p_NXID"]= NXID; + props["defines/" "p_NYID"]= NYID; + props["defines/" "p_NZID"]= NZID; + props["defines/" "p_SJID"]= SJID; + props["defines/" "p_IJID"]= IJID; + props["defines/" "p_IHID"]= IHID; + props["defines/" "p_WSJID"]= WSJID; + props["defines/" "p_WIJID"]= WIJID; + + sgeo.malloc(Nelements*Nsgeo*Nfp*Nfaces); - cubsgeo = (dfloat*) calloc((Nelements+totalHaloPairs)* - Nsgeo*cubNq*Nfaces, - sizeof(dfloat)); + memory hinv((Nelements+totalHaloPairs)*Nfp*Nfaces); - dfloat *_cubx = (dfloat*) calloc((Nelements+totalHaloPairs)* - cubNq*Nfaces, sizeof(dfloat)); + // cubsgeo.malloc(Nelements*Nsgeo*cubNfp*Nfaces); - dfloat *_cuby = (dfloat*) calloc((Nelements+totalHaloPairs)* - cubNq*Nfaces, sizeof(dfloat)); + // dfloat *_cubx = (dfloat*) calloc((Nelements+totalHaloPairs)* + // cubNq*Nfaces, sizeof(dfloat)); - dfloat *_cubz = (dfloat*) calloc((Nelements+totalHaloPairs)* - cubNq*Nfaces, sizeof(dfloat)); + // dfloat *_cuby = (dfloat*) calloc((Nelements+totalHaloPairs)* + // cubNq*Nfaces, sizeof(dfloat)); + // dfloat *_cubz = (dfloat*) calloc((Nelements+totalHaloPairs)* + // cubNq*Nfaces, sizeof(dfloat)); - dfloat *xr = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *yr = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *zr = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *xs = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *ys = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *zs = 
(dfloat*) calloc(Np, sizeof(dfloat)); + memory xr(Np); + memory yr(Np); + memory zr(Np); - dfloat *J = (dfloat*) calloc(Np, sizeof(dfloat)); + memory xs(Np); + memory ys(Np); + memory zs(Np); - for(int e=0;e J(Np); + + for(int e=0;e1e-12) - printf("mindist2 = %g\n", mindist2); - - idM = Nsgeo*( e*cubNq*Nfaces+ f*cubNq+n)+IHID; - idP = Nsgeo*(eP*cubNq*Nfaces+fP*cubNq+minidP)+IHID; - - dfloat hinv = mymax(cubsgeo[idM],cubsgeo[idP]); - cubsgeo[idM] = hinv; - cubsgeo[idP] = hinv; - - } + dfloat hinvM = hinv[baseM]; + dfloat hinvP = hinv[baseP]; + sgeo[baseM*Nsgeo+IHID] = std::max(hinvM,hinvP); } } - + // for(dlong e=0;e1e-12) + // printf("mindist2 = %g\n", mindist2); + + // idM = Nsgeo*( e*cubNq*Nfaces+ f*cubNq+n)+IHID; + // idP = Nsgeo*(eP*cubNq*Nfaces+fP*cubNq+minidP)+IHID; + + // dfloat hinv = mymax(cubsgeo[idM],cubsgeo[idP]); + // cubsgeo[idM] = hinv; + // cubsgeo[idP] = hinv; + + // } + // } + // } + + o_sgeo = platform.malloc(sgeo); } + +} //namespace libp diff --git a/libs/mesh/meshSurfaceGeometricFactorsTet3D.cpp b/libs/mesh/meshSurfaceGeometricFactorsTet3D.cpp index 727356ba9..02bbf7323 100644 --- a/libs/mesh/meshSurfaceGeometricFactorsTet3D.cpp +++ b/libs/mesh/meshSurfaceGeometricFactorsTet3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,53 +25,34 @@ SOFTWARE. 
*/ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" -static void computeFrameTet3D(dfloat nx, dfloat ny, dfloat nz, - dfloat &tanx, dfloat &tany, dfloat &tanz, - dfloat &binx, dfloat &biny, dfloat &binz){ +namespace libp { - dfloat ranx = drand48(); - dfloat rany = drand48(); - dfloat ranz = drand48(); +void mesh_t::SurfaceGeometricFactorsTet3D(){ - dfloat magran = sqrt(ranx*ranx+rany*rany+ranz*ranz); - - ranx /= magran; - rany /= magran; - ranz /= magran; - - tanx = ny*ranz - nz*rany; - tany = nz*ranx - nx*ranz; - tanz = nx*rany - ny*ranx; - - dfloat magtan = sqrt(tanx*tanx+tany*tany+tanz*tanz); - - tanx /= magtan; - tany /= magtan; - tanz /= magtan; + /* unified storage array for geometric factors */ + Nsgeo = 6; - binx = ny*tanz - nz*tany; - biny = nz*tanx - nx*tanz; - binz = nx*tany - ny*tanx; + NXID = 0; + NYID = 1; + NZID = 2; + SJID = 3; + IJID = 4; + IHID = 5; - dfloat magbin = sqrt(binx*binx+biny*biny+binz*binz); + props["defines/" "p_Nsgeo"]= Nsgeo; + props["defines/" "p_NXID"]= NXID; + props["defines/" "p_NYID"]= NYID; + props["defines/" "p_NZID"]= NZID; + props["defines/" "p_SJID"]= SJID; + props["defines/" "p_IJID"]= IJID; + props["defines/" "p_IHID"]= IHID; - binx /= magbin; - biny /= magbin; - binz /= magbin; + sgeo.malloc(Nelements*Nsgeo*Nfaces); - // printf("nor = %g,%g,%g; tan = %g,%g,%g; bin = %g,%g,%g\n", nx, ny, nz, tanx, tany, tanz, binx, biny, binz); -} + memory hinv((Nelements+totalHaloPairs)*Nfaces); -void meshTet3D::SurfaceGeometricFactors(){ - - /* unified storage array for geometric factors */ - Nsgeo = 14; - sgeo = (dfloat*) calloc((Nelements+totalHaloPairs)* - Nsgeo*Nfaces, sizeof(dfloat)); - - for(dlong e=0;e=0) ? 
(idP/Np):e; + for(dlong eM=0;eM (J*4/3) = (sJ*2)*h/3 => h = 2*J/sJ - dfloat hinvM = 0.5*sgeo[baseM*Nsgeo + SJID]*sgeo[baseM*Nsgeo + IJID]; - dfloat hinvP = 0.5*sgeo[baseP*Nsgeo + SJID]*sgeo[baseP*Nsgeo + IJID]; + // rescaling - A = L*h/2 => (J*2) = (sJ*2)*h/2 => h = 2*J/sJ + dfloat hinvM = hinv[baseM]; + dfloat hinvP = hinv[baseP]; + sgeo[baseM*Nsgeo+IHID] = std::max(hinvM,hinvP); - sgeo[baseM*Nsgeo+IHID] = mymax(hinvM,hinvP); - sgeo[baseP*Nsgeo+IHID] = mymax(hinvM,hinvP); + // if (EToB[fM+eM*Nfaces] > 0) { //enforce a stronger penalty on boundaries + // sgeo[baseM*Nsgeo+IHID] *= 2; + // } } } + + o_sgeo = platform.malloc(sgeo); } + +} //namespace libp diff --git a/libs/mesh/meshSurfaceGeometricFactorsTri2D.cpp b/libs/mesh/meshSurfaceGeometricFactorsTri2D.cpp index 94de0bccd..80d6dc430 100644 --- a/libs/mesh/meshSurfaceGeometricFactorsTri2D.cpp +++ b/libs/mesh/meshSurfaceGeometricFactorsTri2D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,17 +25,32 @@ SOFTWARE. 
*/ #include "mesh.hpp" -#include "mesh/mesh2D.hpp" -void meshTri2D::SurfaceGeometricFactors(){ +namespace libp { + +void mesh_t::SurfaceGeometricFactorsTri2D(){ /* unified storage array for geometric factors */ - Nsgeo = 6; - sgeo = (dfloat*) calloc((Nelements+totalHaloPairs)* - Nsgeo*Nfaces, - sizeof(dfloat)); + Nsgeo = 5; + + NXID = 0; + NYID = 1; + SJID = 2; + IJID = 3; + IHID = 4; + + props["defines/" "p_Nsgeo"]= Nsgeo; + props["defines/" "p_NXID"]= NXID; + props["defines/" "p_NYID"]= NYID; + props["defines/" "p_SJID"]= SJID; + props["defines/" "p_IJID"]= IJID; + props["defines/" "p_IHID"]= IHID; + + sgeo.malloc(Nelements*Nsgeo*Nfaces); - for(dlong e=0;e hinv((Nelements+totalHaloPairs)*Nfaces); + + for(dlong e=0;ed2 sgeo[base+IJID] = 1./J; + hinv[Nfaces*e+1] = 0.25*d2/J; + /* face 3 */ base += Nsgeo; dfloat nx3 = ye1-ye3; @@ -88,46 +105,45 @@ void meshTri2D::SurfaceGeometricFactors(){ sgeo[base+NYID] = ny3/d3; sgeo[base+SJID] = d3/2.; sgeo[base+IJID] = 1./J; + + hinv[Nfaces*e+2] = 0.25*d3/J; } + halo.Exchange(hinv, Nfaces); - dfloat href = 0.; - dfloat tol = 1.; - for(dlong e=0;e (J*2) = (sJ*2)*h/2 => h = 2*J/sJ - dfloat hinvM = sgeo[baseM*Nsgeo + SJID]*sgeo[baseM*Nsgeo + IJID]; + // // rescaling - missing factor of 2 ? (only impacts penalty and thus stiffness) A = L*h/2 => (J*2) = (sJ*2)*h/2 => h = 2*J/sJ + // dfloat hinvM = sgeo[baseM*Nsgeo + SJID]*sgeo[baseM*Nsgeo + IJID]; - href = mymax(hinvM,href); - } - } + // href = mymax(hinvM,href); + // } + // } - for(dlong e=0;e=0) ? (idP/Np):e; + if (eP<0) eP = eM; - int fP = EToF[baseM]; - fP = (fP==-1) ? 
f:fP; + int fP = EToF[eM*Nfaces+fM]; + if (fP<0) fP = fM; + dlong baseM = eM*Nfaces + fM; dlong baseP = eP*Nfaces + fP; // rescaling - A = L*h/2 => (J*2) = (sJ*2)*h/2 => h = 2*J/sJ - dfloat hinvM = 0.5*sgeo[baseM*Nsgeo + SJID]*sgeo[baseM*Nsgeo + IJID]; - dfloat hinvP = 0.5*sgeo[baseP*Nsgeo + SJID]*sgeo[baseP*Nsgeo + IJID]; - - sgeo[baseM*Nsgeo+IHID] = mymax(hinvM,hinvP); - sgeo[baseP*Nsgeo+IHID] = mymax(hinvM,hinvP); + dfloat hinvM = hinv[baseM]; + dfloat hinvP = hinv[baseP]; + sgeo[baseM*Nsgeo+IHID] = std::max(hinvM,hinvP); - if (EToB[f+e*Nfaces] > 0) { //enforce a stronger penalty on boundaries - sgeo[baseM*Nsgeo+IHID] = mymax(sgeo[baseM*Nsgeo+IHID],tol*href); - sgeo[baseP*Nsgeo+IHID] = mymax(sgeo[baseP*Nsgeo+IHID],tol*href); - } + // if (EToB[fM+eM*Nfaces] > 0) { //enforce a stronger penalty on boundaries + // sgeo[baseM*Nsgeo+IHID] *= 2; + // } #if 0 printf("e=%d f=%d (eP=%d,fP=%d) nx=%5.4f, ny=%5.4f, sJ=%5.4f, invJ=%5.4f, hinv=%f\n" ,e,f,eP,fP, @@ -140,4 +156,7 @@ void meshTri2D::SurfaceGeometricFactors(){ } } + o_sgeo = platform.malloc(sgeo); } + +} //namespace libp diff --git a/libs/mesh/meshSurfaceGeometricFactorsTri3D.cpp b/libs/mesh/meshSurfaceGeometricFactorsTri3D.cpp index cbbc5eeb0..c4e184ed2 100644 --- a/libs/mesh/meshSurfaceGeometricFactorsTri3D.cpp +++ b/libs/mesh/meshSurfaceGeometricFactorsTri3D.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,26 +25,43 @@ SOFTWARE. 
*/ #include "mesh.hpp" -#include "mesh/mesh3D.hpp" + +namespace libp { /* compute outwards facing normals, surface Jacobian, and volume Jacobian for all face nodes */ -void meshTri3D::SurfaceGeometricFactors(){ +void mesh_t::SurfaceGeometricFactorsTri3D(){ /* unified storage array for geometric factors */ - Nsgeo = 14; - sgeo = (dfloat*) calloc((Nelements+totalHaloPairs)* - Nsgeo*Nfp*Nfaces, - sizeof(dfloat)); + Nsgeo = 6; + + NXID = 0; + NYID = 1; + NZID = 2; + SJID = 3; + IJID = 4; + IHID = 5; + + props["defines/" "p_Nsgeo"]= Nsgeo; + props["defines/" "p_NXID"]= NXID; + props["defines/" "p_NYID"]= NYID; + props["defines/" "p_NZID"]= NZID; + props["defines/" "p_SJID"]= SJID; + props["defines/" "p_IJID"]= IJID; + props["defines/" "p_IHID"]= IHID; + + sgeo.malloc(Nelements*Nsgeo*Nfp*Nfaces); - dfloat *xr = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *yr = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *zr = (dfloat*) calloc(Np, sizeof(dfloat)); + memory hinv((Nelements+totalHaloPairs)*Nfp*Nfaces); - dfloat *xs = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *ys = (dfloat*) calloc(Np, sizeof(dfloat)); - dfloat *zs = (dfloat*) calloc(Np, sizeof(dfloat)); + memory xr(Np); + memory yr(Np); + memory zr(Np); - dfloat *J = (dfloat*) calloc(Np, sizeof(dfloat)); + memory xs(Np); + memory ys(Np); + memory zs(Np); + + memory J (Np); for(int e=0;e(sgeo); + #if 0 for(int e=0;e +void ogs_t::GatherScatter(deviceMemory o_v, + const int k, + const Op op, + const Transpose trans){ + GatherScatterStart (o_v, k, op, trans); + GatherScatterFinish(o_v, k, op, trans); +} + +template +void ogs_t::GatherScatterStart(deviceMemory o_v, + const int k, + const Op op, + const Transpose trans){ + exchange->AllocBuffer(k*sizeof(T)); + + deviceMemory o_haloBuf = exchange->o_workspace; -// Host buffer versions -void ogs_t::GatherScatter (void *v, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::hostGatherScatter(v, 1, 1, 0, type, op, trans, *this); } + //collect halo 
buffer + gatherHalo->Gather(o_haloBuf, o_v, k, op, trans); -void ogs_t::GatherScatterVec (void *v, const int k, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::hostGatherScatter(v, k, 1, 0, type, op, trans, *this); } + if (exchange->gpu_aware) { + //prepare MPI exchange + exchange->Start(o_haloBuf, k, op, trans); + } else { + //get current stream + device_t &device = platform.device; + stream_t currentStream = device.getStream(); -void ogs_t::GatherScatterMany(void *v, const int k, const dlong stride, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::hostGatherScatter(v, 1, k, stride, type, op, trans, *this); } + pinnedMemory haloBuf = exchange->h_workspace; -void ogs_t::Gather (void *gv, void *v, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::hostGather(gv, v, 1, 1, 0, 0, type, op, trans, *this); } + //if not using gpu-aware mpi move the halo buffer to the host + const dlong Nhalo = (trans == NoTrans) ? NhaloP : NhaloT; -void ogs_t::GatherVec (void *gv, void *v, const int k, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::hostGather(gv, v, k, 1, 0, 0, type, op, trans, *this); } + //wait for o_haloBuf to be ready + device.finish(); + + //queue copy to host + device.setStream(dataStream); + haloBuf.copyFrom(o_haloBuf, Nhalo*k, + 0, properties_t("async", true)); + device.setStream(currentStream); + } +} -void ogs_t::GatherMany(void *gv, void *v, const int k, - const dlong gstride, const dlong stride, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::hostGather(gv, v, 1, k, gstride, stride, type, op, trans, *this); } +template +void ogs_t::GatherScatterFinish(deviceMemory o_v, + const int k, + const Op op, + const Transpose trans){ + + //queue local gs operation + gatherLocal->GatherScatter(o_v, k, op, trans); + + deviceMemory o_haloBuf = exchange->o_workspace; + + if (exchange->gpu_aware) { + //finish MPI exchange + 
exchange->Finish(o_haloBuf, k, op, trans); + } else { + pinnedMemory haloBuf = exchange->h_workspace; + + //get current stream + device_t &device = platform.device; + stream_t currentStream = device.getStream(); + + //synchronize data stream to ensure the buffer is on the host + device.setStream(dataStream); + device.finish(); + + /*MPI exchange of host buffer*/ + exchange->Start (haloBuf, k, op, trans); + exchange->Finish(haloBuf, k, op, trans); + + // copy recv back to device + const dlong Nhalo = (trans == Trans) ? NhaloP : NhaloT; + haloBuf.copyTo(o_haloBuf, Nhalo*k, + 0, properties_t("async", true)); + device.finish(); //wait for transfer to finish + device.setStream(currentStream); + } + + //write exchanged halo buffer back to vector + gatherHalo->Scatter(o_v, o_haloBuf, k, trans); +} -void ogs_t::Scatter (void *v, void *gv, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::hostScatter(v, gv, 1, 1, 0, 0, type, op, trans, *this); } +template +void ogs_t::GatherScatter(deviceMemory v, const int k, + const Op op, const Transpose trans); +template +void ogs_t::GatherScatter(deviceMemory v, const int k, + const Op op, const Transpose trans); +template +void ogs_t::GatherScatter(deviceMemory v, const int k, + const Op op, const Transpose trans); +template +void ogs_t::GatherScatter(deviceMemory v, const int k, + const Op op, const Transpose trans); + +/******************************** + * Host GatherScatter + ********************************/ +template +void ogs_t::GatherScatter(memory v, + const int k, + const Op op, + const Transpose trans){ + GatherScatterStart (v, k, op, trans); + GatherScatterFinish(v, k, op, trans); +} -void ogs_t::ScatterVec (void *v, void *gv, const int k, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::hostScatter(v, gv, k, 1, 0, 0, type, op, trans, *this); } +template +void ogs_t::GatherScatterStart(memory v, + const int k, + const Op op, + const Transpose trans){ + 
exchange->AllocBuffer(k*sizeof(T)); -void ogs_t::ScatterMany(void *v, void *gv, const int k, - const dlong stride, const dlong gstride, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::hostScatter(v, gv, 1, k, stride, gstride, type, op, trans, *this); } + /*Cast workspace to type T*/ + pinnedMemory haloBuf = exchange->h_workspace; + //collect halo buffer + gatherHalo->Gather(haloBuf, v, k, op, trans); -// Synchronous device buffer versions -void ogs_t::GatherScatter (occa::memory& o_v, - const ogs_type type, const ogs_op op, const ogs_transpose trans) { - ogs::occaGatherScatterStart (o_v, 1, 1, 0, type, op, trans, *this); - ogs::occaGatherScatterFinish(o_v, 1, 1, 0, type, op, trans, *this); + //prepare MPI exchange + exchange->Start(haloBuf, k, op, trans); } -void ogs_t::GatherScatterVec (occa::memory& o_v, const int k, - const ogs_type type, const ogs_op op, const ogs_transpose trans) { - ogs::occaGatherScatterStart (o_v, k, 1, 0, type, op, trans, *this); - ogs::occaGatherScatterFinish(o_v, k, 1, 0, type, op, trans, *this); +template +void ogs_t::GatherScatterFinish(memory v, + const int k, + const Op op, + const Transpose trans){ + + /*Cast workspace to type T*/ + pinnedMemory haloBuf = exchange->h_workspace; + + //queue local gs operation + gatherLocal->GatherScatter(v, k, op, trans); + + //finish MPI exchange + exchange->Finish(haloBuf, k, op, trans); + + //write exchanged halo buffer back to vector + gatherHalo->Scatter(v, haloBuf, k, trans); } -void ogs_t::GatherScatterMany(occa::memory& o_v, const int k, - const dlong stride, - const ogs_type type, const ogs_op op, const ogs_transpose trans) { - ogs::occaGatherScatterStart (o_v, 1, k, stride, type, op, trans, *this); - ogs::occaGatherScatterFinish(o_v, 1, k, stride, type, op, trans, *this); +template +void ogs_t::GatherScatter(memory v, const int k, + const Op op, const Transpose trans); +template +void ogs_t::GatherScatter(memory v, const int k, + const Op op, const Transpose 
trans); +template +void ogs_t::GatherScatter(memory v, const int k, + const Op op, const Transpose trans); +template +void ogs_t::GatherScatter(memory v, const int k, + const Op op, const Transpose trans); + +/******************************** + * Device Gather + ********************************/ +template +void ogs_t::Gather(deviceMemory o_gv, + deviceMemory o_v, + const int k, + const Op op, + const Transpose trans){ + GatherStart (o_gv, o_v, k, op, trans); + GatherFinish(o_gv, o_v, k, op, trans); } -void ogs_t::Gather (occa::memory& o_gv, occa::memory& o_v, - const ogs_type type, const ogs_op op, const ogs_transpose trans) { - ogs::occaGatherStart (o_gv, o_v, 1, 1, 0, 0, type, op, trans, *this); - ogs::occaGatherFinish(o_gv, o_v, 1, 1, 0, 0, type, op, trans, *this); +template +void ogs_t::GatherStart(deviceMemory o_gv, + deviceMemory o_v, + const int k, + const Op op, + const Transpose trans){ + AssertGatherDefined(); + + deviceMemory o_haloBuf = exchange->o_workspace; + + if (trans==Trans) { //if trans!=ogs::Trans theres no comms required + exchange->AllocBuffer(k*sizeof(T)); + + //collect halo buffer + gatherHalo->Gather(o_haloBuf, o_v, k, op, Trans); + + if (exchange->gpu_aware) { + //prepare MPI exchange + exchange->Start(o_haloBuf, k, op, Trans); + } else { + //get current stream + device_t &device = platform.device; + stream_t currentStream = device.getStream(); + + //if not using gpu-aware mpi move the halo buffer to the host + pinnedMemory haloBuf = exchange->h_workspace; + + //wait for o_haloBuf to be ready + device.finish(); + + //queue copy to host + device.setStream(dataStream); + haloBuf.copyFrom(o_haloBuf, NhaloT*k, + 0, properties_t("async", true)); + device.setStream(currentStream); + } + } else { + //gather halo + gatherHalo->Gather(o_gv + k*NlocalT, o_v, k, op, trans); + } } -void ogs_t::GatherVec (occa::memory& o_gv, occa::memory& o_v, const int k, - const ogs_type type, const ogs_op op, const ogs_transpose trans) { - ogs::occaGatherStart 
(o_gv, o_v, k, 1, 0, 0, type, op, trans, *this); - ogs::occaGatherFinish(o_gv, o_v, k, 1, 0, 0, type, op, trans, *this); +template +void ogs_t::GatherFinish(deviceMemory o_gv, + deviceMemory o_v, + const int k, + const Op op, + const Transpose trans){ + AssertGatherDefined(); + + deviceMemory o_haloBuf = exchange->o_workspace; + + //queue local g operation + gatherLocal->Gather(o_gv, o_v, k, op, trans); + + if (trans==Trans) { //if trans!=ogs::Trans theres no comms required + if (exchange->gpu_aware) { + //finish MPI exchange + exchange->Finish(o_haloBuf, k, op, Trans); + + //put the result at the end of o_gv + o_haloBuf.copyTo(o_gv + k*NlocalT, + k*NhaloP, 0, properties_t("async", true)); + } else { + pinnedMemory haloBuf = exchange->h_workspace; + + //get current stream + device_t &device = platform.device; + stream_t currentStream = device.getStream(); + + //synchronize data stream to ensure the buffer is on the host + device.setStream(dataStream); + device.finish(); + + /*MPI exchange of host buffer*/ + exchange->Start (haloBuf, k, op, trans); + exchange->Finish(haloBuf, k, op, trans); + + // copy recv back to device + //put the result at the end of o_gv + haloBuf.copyTo(o_gv + k*NlocalT, k*NhaloP, + 0, properties_t("async", true)); + device.finish(); //wait for transfer to finish + device.setStream(currentStream); + } + } } -void ogs_t::GatherMany(occa::memory& o_gv, occa::memory& o_v, const int k, - const dlong gstride, const dlong stride, - const ogs_type type, const ogs_op op, const ogs_transpose trans) { - ogs::occaGatherStart (o_gv, o_v, 1, k, gstride, stride, type, op, trans, *this); - ogs::occaGatherFinish(o_gv, o_v, 1, k, gstride, stride, type, op, trans, *this); +template +void ogs_t::Gather(deviceMemory v, const deviceMemory gv, + const int k, const Op op, const Transpose trans); +template +void ogs_t::Gather(deviceMemory v, const deviceMemory gv, + const int k, const Op op, const Transpose trans); +template +void ogs_t::Gather(deviceMemory v, const 
deviceMemory gv, + const int k, const Op op, const Transpose trans); +template +void ogs_t::Gather(deviceMemory v, const deviceMemory gv, + const int k, const Op op, const Transpose trans); + +/******************************** + * Host Gather + ********************************/ + +//host versions +template +void ogs_t::Gather(memory gv, + const memory v, + const int k, + const Op op, + const Transpose trans){ + GatherStart (gv, v, k, op, trans); + GatherFinish(gv, v, k, op, trans); } -void ogs_t::Scatter (occa::memory& o_v, occa::memory& o_gv, - const ogs_type type, const ogs_op op, const ogs_transpose trans) { - ogs::occaScatterStart (o_v, o_gv, 1, 1, 0, 0, type, op, trans, *this); - ogs::occaScatterFinish(o_v, o_gv, 1, 1, 0, 0, type, op, trans, *this); +template +void ogs_t::GatherStart(memory gv, + const memory v, + const int k, + const Op op, + const Transpose trans){ + AssertGatherDefined(); + + if (trans==Trans) { //if trans!=ogs::Trans theres no comms required + exchange->AllocBuffer(k*sizeof(T)); + + /*Cast workspace to type T*/ + pinnedMemory haloBuf = exchange->h_workspace; + + //collect halo buffer + gatherHalo->Gather(haloBuf, v, k, op, Trans); + + //prepare MPI exchange + exchange->Start(haloBuf, k, op, Trans); + } else { + //gather halo + gatherHalo->Gather(gv + k*NlocalT, v, k, op, trans); + } } -void ogs_t::ScatterVec (occa::memory& o_v, occa::memory& o_gv, const int k, - const ogs_type type, const ogs_op op, const ogs_transpose trans) { - ogs::occaScatterStart (o_v, o_gv, k, 1, 0, 0, type, op, trans, *this); - ogs::occaScatterFinish(o_v, o_gv, k, 1, 0, 0, type, op, trans, *this); +template +void ogs_t::GatherFinish(memory gv, + const memory v, + const int k, + const Op op, + const Transpose trans){ + AssertGatherDefined(); + + //queue local g operation + gatherLocal->Gather(gv, v, k, op, trans); + + if (trans==Trans) { //if trans!=ogs::Trans theres no comms required + /*Cast workspace to type T*/ + pinnedMemory haloBuf = exchange->h_workspace; + + 
//finish MPI exchange + exchange->Finish(haloBuf, k, op, Trans); + + //put the result at the end of o_gv + haloBuf.copyTo(gv+k*NlocalT, k*NhaloP); + } } -void ogs_t::ScatterMany(occa::memory& o_v, occa::memory& o_gv, const int k, - const dlong stride, const dlong gstride, - const ogs_type type, const ogs_op op, const ogs_transpose trans) { - ogs::occaScatterStart (o_v, o_gv, 1, k, stride, gstride, type, op, trans, *this); - ogs::occaScatterFinish(o_v, o_gv, 1, k, stride, gstride, type, op, trans, *this); +template +void ogs_t::Gather(memory v, const memory gv, + const int k, const Op op, const Transpose trans); +template +void ogs_t::Gather(memory v, const memory gv, + const int k, const Op op, const Transpose trans); +template +void ogs_t::Gather(memory v, const memory gv, + const int k, const Op op, const Transpose trans); +template +void ogs_t::Gather(memory v, const memory gv, + const int k, const Op op, const Transpose trans); + +/******************************** + * Device Scatter + ********************************/ +template +void ogs_t::Scatter(deviceMemory o_v, + deviceMemory o_gv, + const int k, + const Transpose trans){ + ScatterStart (o_v, o_gv, k, trans); + ScatterFinish(o_v, o_gv, k, trans); } -// Asynchronous device buffer versions -void ogs_t::GatherScatterStart (occa::memory& o_v, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::occaGatherScatterStart (o_v, 1, 1, 0, type, op, trans, *this); } +template +void ogs_t::ScatterStart(deviceMemory o_v, + deviceMemory o_gv, + const int k, + const Transpose trans){ + AssertGatherDefined(); -void ogs_t::GatherScatterFinish (occa::memory& o_v, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::occaGatherScatterFinish(o_v, 1, 1, 0, type, op, trans, *this); } + deviceMemory o_haloBuf = exchange->o_workspace; -void ogs_t::GatherScatterVecStart (occa::memory& o_v, const int k, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ 
ogs::occaGatherScatterStart (o_v, k, 1, 0, type, op, trans, *this); } + if (trans==NoTrans) { //if trans!=ogs::NoTrans theres no comms required + exchange->AllocBuffer(k*sizeof(T)); -void ogs_t::GatherScatterVecFinish (occa::memory& o_v, const int k, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::occaGatherScatterFinish(o_v, k, 1, 0, type, op, trans, *this); } + device_t &device = platform.device; -void ogs_t::GatherScatterManyStart (occa::memory& o_v, const int k, const dlong stride, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::occaGatherScatterStart (o_v, 1, k, stride, type, op, trans, *this); } + if (exchange->gpu_aware) { + //collect halo buffer + o_haloBuf.copyFrom(o_gv + k*NlocalT, + k*NhaloP, 0, properties_t("async", true)); -void ogs_t::GatherScatterManyFinish(occa::memory& o_v, const int k, const dlong stride, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::occaGatherScatterFinish(o_v, 1, k, stride, type, op, trans, *this); } + //wait for o_haloBuf to be ready + device.finish(); -void ogs_t::GatherStart (occa::memory& o_gv, occa::memory& o_v, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::occaGatherStart (o_gv, o_v, 1, 1, 0, 0, type, op, trans, *this); } + //prepare MPI exchange + exchange->Start(o_haloBuf, k, Add, NoTrans); + } else { + //get current stream + stream_t currentStream = device.getStream(); -void ogs_t::GatherFinish (occa::memory& o_gv, occa::memory& o_v, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::occaGatherFinish(o_gv, o_v, 1, 1, 0, 0, type, op, trans, *this); } + //if not using gpu-aware mpi move the halo buffer to the host + pinnedMemory haloBuf = exchange->h_workspace; -void ogs_t::GatherVecStart (occa::memory& o_gv, occa::memory& o_v, const int k, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::occaGatherStart (o_gv, o_v, k, 1, 0, 0, type, op, trans, *this); } + //wait 
for o_gv to be ready + device.finish(); -void ogs_t::GatherVecFinish (occa::memory& o_gv, occa::memory& o_v, const int k, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::occaGatherFinish(o_gv, o_v, k, 1, 0, 0, type, op, trans, *this); } + //queue copy to host + device.setStream(dataStream); + haloBuf.copyFrom(o_gv + k*NlocalT, NhaloP*k, + 0, properties_t("async", true)); + device.setStream(currentStream); + } + } +} -void ogs_t::GatherManyStart (occa::memory& o_gv, occa::memory& o_v, const int k, - const dlong gstride, const dlong stride, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::occaGatherStart (o_gv, o_v, 1, k, gstride, stride, type, op, trans, *this); } +template +void ogs_t::ScatterFinish(deviceMemory o_v, + deviceMemory o_gv, + const int k, + const Transpose trans){ + AssertGatherDefined(); + + deviceMemory o_haloBuf = exchange->o_workspace; + + //queue local s operation + gatherLocal->Scatter(o_v, o_gv, k, trans); + + if (trans==NoTrans) { //if trans!=ogs::NoTrans theres no comms required + if (exchange->gpu_aware) { + //finish MPI exchange + exchange->Finish(o_haloBuf, k, Add, NoTrans); + } else { + pinnedMemory haloBuf = exchange->h_workspace; + + //get current stream + device_t &device = platform.device; + stream_t currentStream = device.getStream(); + + //synchronize data stream to ensure the buffer is on the host + device.setStream(dataStream); + device.finish(); + + /*MPI exchange of host buffer*/ + exchange->Start (haloBuf, k, Add, NoTrans); + exchange->Finish(haloBuf, k, Add, NoTrans); + + // copy recv back to device + haloBuf.copyTo(o_haloBuf, NhaloT*k, + 0, properties_t("async", true)); + device.finish(); //wait for transfer to finish + device.setStream(currentStream); + } + + //scatter halo buffer + gatherHalo->Scatter(o_v, o_haloBuf, k, NoTrans); + } else { + //scatter halo + gatherHalo->Scatter(o_v, o_gv + k*NlocalT, k, trans); + } +} -void ogs_t::GatherManyFinish(occa::memory& o_gv, 
occa::memory& o_v, const int k, - const dlong gstride, const dlong stride, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::occaGatherFinish(o_gv, o_v, 1, k, gstride, stride, type, op, trans, *this); } +template +void ogs_t::Scatter(deviceMemory v, const deviceMemory gv, + const int k, const Transpose trans); +template +void ogs_t::Scatter(deviceMemory v, const deviceMemory gv, + const int k, const Transpose trans); +template +void ogs_t::Scatter(deviceMemory v, const deviceMemory gv, + const int k, const Transpose trans); +template +void ogs_t::Scatter(deviceMemory v, const deviceMemory gv, + const int k, const Transpose trans); + +/******************************** + * Host Scatter + ********************************/ + +//host versions +template +void ogs_t::Scatter(memory v, + const memory gv, + const int k, + const Transpose trans){ + ScatterStart (v, gv, k, trans); + ScatterFinish(v, gv, k, trans); +} -void ogs_t::ScatterStart (occa::memory& o_v, occa::memory& o_gv, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::occaScatterStart (o_v, o_gv, 1, 1, 0, 0, type, op, trans, *this); } +template +void ogs_t::ScatterStart(memory v, + const memory gv, + const int k, + const Transpose trans){ + AssertGatherDefined(); -void ogs_t::ScatterFinish (occa::memory& o_v, occa::memory& o_gv, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::occaScatterFinish(o_v, o_gv, 1, 1, 0, 0, type, op, trans, *this); } + if (trans==NoTrans) { //if trans!=ogs::NoTrans theres no comms required + exchange->AllocBuffer(k*sizeof(T)); -void ogs_t::ScatterVecStart (occa::memory& o_v, occa::memory& o_gv, const int k, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::occaScatterStart (o_v, o_gv, k, 1, 0, 0, type, op, trans, *this); } + /*Cast workspace to type T*/ + pinnedMemory haloBuf = exchange->h_workspace; -void ogs_t::ScatterVecFinish (occa::memory& o_v, occa::memory& o_gv, const int k, - 
const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::occaScatterFinish(o_v, o_gv, k, 1, 0, 0, type, op, trans, *this); } + //collect halo buffer + haloBuf.copyFrom(gv + k*NlocalT, k*NhaloP); -void ogs_t::ScatterManyStart (occa::memory& o_v, occa::memory& o_gv, const int k, - const dlong stride, const dlong gstride, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::occaScatterStart (o_v, o_gv, 1, k, stride, gstride, type, op, trans, *this); } + //prepare MPI exchange + exchange->Start(haloBuf, k, Add, NoTrans); + } +} -void ogs_t::ScatterManyFinish(occa::memory& o_v, occa::memory& o_gv, const int k, - const dlong stride, const dlong gstride, - const ogs_type type, const ogs_op op, const ogs_transpose trans) -{ ogs::occaScatterFinish(o_v, o_gv, 1, k, stride, gstride, type, op, trans, *this); } +template +void ogs_t::ScatterFinish(memory v, + const memory gv, + const int k, + const Transpose trans){ + AssertGatherDefined(); + + //queue local s operation + gatherLocal->Scatter(v, gv, k, trans); + + if (trans==NoTrans) { //if trans!=ogs::NoTrans theres no comms required + /*Cast workspace to type T*/ + pinnedMemory haloBuf = exchange->h_workspace; + + //finish MPI exchange (and put the result at the end of o_gv) + exchange->Finish(haloBuf, k, Add, NoTrans); + + //scatter halo buffer + gatherHalo->Scatter(v, haloBuf, k, NoTrans); + } else { + //scatter halo + gatherHalo->Scatter(v, gv + k*NlocalT, k, trans); + } +} -void ogs_t::Unique(hlong *ids, dlong _N, MPI_Comm _comm) { - ogs::gsUnique(ids, _N, _comm); -} \ No newline at end of file +template +void ogs_t::Scatter(memory v, const memory gv, + const int k, const Transpose trans); +template +void ogs_t::Scatter(memory v, const memory gv, + const int k, const Transpose trans); +template +void ogs_t::Scatter(memory v, const memory gv, + const int k, const Transpose trans); +template +void ogs_t::Scatter(memory v, const memory gv, + const int k, const Transpose trans); +} 
//namespace ogs + +} //namespace libp diff --git a/libs/ogs/ogsAllToAll.cpp b/libs/ogs/ogsAllToAll.cpp new file mode 100644 index 000000000..051ffa174 --- /dev/null +++ b/libs/ogs/ogsAllToAll.cpp @@ -0,0 +1,358 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +#include "ogs.hpp" +#include "ogs/ogsUtils.hpp" +#include "ogs/ogsExchange.hpp" + +#ifdef GLIBCXX_PARALLEL +#include +using __gnu_parallel::sort; +#else +using std::sort; +#endif + +namespace libp { + +namespace ogs { + +/********************************** +* Host exchange +***********************************/ +template +inline void ogsAllToAll_t::Start(pinnedMemory &buf, const int k, + const Op op, const Transpose trans){ + + pinnedMemory sendBuf = h_sendspace; + + // extract the send buffer + if (trans == NoTrans) + extract(NsendN, k, sendIdsN, buf, sendBuf); + else + extract(NsendT, k, sendIdsT, buf, sendBuf); + + if (trans==NoTrans) { + for (int r=0;r +inline void ogsAllToAll_t::Finish(pinnedMemory &buf, const int k, + const Op op, const Transpose trans){ + + comm.Wait(request); + + //if we recvieved anything via MPI, gather the recv buffer and scatter + // it back to to original vector + dlong Nrecv = recvOffsets[size]; + if (Nrecv) { + // gather the recieved nodes + postmpi.Gather(buf, buf, k, op, trans); + } +} + +void ogsAllToAll_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsAllToAll_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsAllToAll_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsAllToAll_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsAllToAll_t::Finish(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsAllToAll_t::Finish(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsAllToAll_t::Finish(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsAllToAll_t::Finish(pinnedMemory &buf, const int k, const Op op, 
const Transpose trans) { Finish(buf, k, op, trans); } + +/********************************** +* GPU-aware exchange +***********************************/ +template +void ogsAllToAll_t::Start(deviceMemory &o_buf, + const int k, + const Op op, + const Transpose trans){ + + const dlong Nsend = (trans == NoTrans) ? NsendN : NsendT; + + if (Nsend) { + deviceMemory o_sendBuf = o_sendspace; + + // assemble the send buffer on device + if (trans == NoTrans) { + extractKernel[ogsType::get()](NsendN, k, o_sendIdsN, o_buf, o_sendBuf); + } else { + extractKernel[ogsType::get()](NsendT, k, o_sendIdsT, o_buf, o_sendBuf); + } + //wait for kernel to finish on default stream + device_t &device = platform.device; + device.finish(); + } +} + +template +void ogsAllToAll_t::Finish(deviceMemory &o_buf, + const int k, + const Op op, + const Transpose trans){ + + deviceMemory o_sendBuf = o_sendspace; + + if (trans==NoTrans) { + for (int r=0;r &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsAllToAll_t::Start(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsAllToAll_t::Start(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsAllToAll_t::Start(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsAllToAll_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsAllToAll_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsAllToAll_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsAllToAll_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } + +ogsAllToAll_t::ogsAllToAll_t(dlong Nshared, + memory &sharedNodes, + ogsOperator_t& gatherHalo, + 
stream_t _dataStream, + comm_t _comm, + platform_t &_platform): + ogsExchange_t(_platform,_comm, _dataStream) { + + Nhalo = gatherHalo.NrowsT; + NhaloP = gatherHalo.NrowsN; + + // sort the list by rank to the order where they will be sent by MPI_Allgatherv + sort(sharedNodes.ptr(), sharedNodes.ptr()+Nshared, + [](const parallelNode_t& a, const parallelNode_t& b) { + if(a.rank < b.rank) return true; //group by rank + if(a.rank > b.rank) return false; + + return a.newId < b.newId; //then order by the localId relative to this rank + }); + + //make mpi allgatherv counts and offsets + mpiSendCountsT.calloc(size); + mpiSendCountsN.calloc(size); + mpiRecvCountsT.malloc(size); + mpiRecvCountsN.malloc(size); + mpiSendOffsetsT.malloc(size+1); + mpiSendOffsetsN.malloc(size+1); + mpiRecvOffsetsN.malloc(size+1); + mpiRecvOffsetsT.malloc(size+1); + + for (dlong n=0;n0) mpiSendCountsN[r]++; + mpiSendCountsT[r]++; + } + + //shared counts + comm.Alltoall(mpiSendCountsT, mpiRecvCountsT); + comm.Alltoall(mpiSendCountsN, mpiRecvCountsN); + + //cumulative sum + mpiSendOffsetsN[0] = 0; + mpiSendOffsetsT[0] = 0; + mpiRecvOffsetsN[0] = 0; + mpiRecvOffsetsT[0] = 0; + for (int r=0;r recvNodes(Nrecv); + + //Send list of nodes to each rank + comm.Alltoallv(sharedNodes, mpiSendCountsT, mpiSendOffsetsT, + recvNodes, mpiRecvCountsT, mpiRecvOffsetsT); + + //make ops for gathering halo nodes after an MPI_Allgatherv + postmpi.platform = platform; + postmpi.kind = Signed; + + postmpi.NrowsN = Nhalo; + postmpi.NrowsT = Nhalo; + postmpi.rowStartsN.malloc(Nhalo+1); + postmpi.rowStartsT.malloc(Nhalo+1); + + //make array of counters + memory haloGatherTCounts(Nhalo); + memory haloGatherNCounts(Nhalo); + + //count the data that will already be in h_haloBuf.ptr() + for (dlong n=0;n(postmpi.nnzT*Nbytes); + o_workspace = platform.malloc(postmpi.nnzT*Nbytes); + } + if (o_sendspace.size() < NsendT*Nbytes) { + h_sendspace = platform.hostMalloc(NsendT*Nbytes); + o_sendspace = platform.malloc(NsendT*Nbytes); + } 
+} + +} //namespace ogs + +} //namespace libp diff --git a/libs/ogs/ogsAuto.cpp b/libs/ogs/ogsAuto.cpp new file mode 100644 index 000000000..42868e969 --- /dev/null +++ b/libs/ogs/ogsAuto.cpp @@ -0,0 +1,349 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +#include "ogs.hpp" +#include "ogs/ogsUtils.hpp" +#include "ogs/ogsOperator.hpp" +#include "ogs/ogsExchange.hpp" +#include "timer.hpp" + +namespace libp { + +namespace ogs { + +static void DeviceExchangeTest(ogsExchange_t* exchange, double time[3]) { + const int Ncold = 10; + const int Nhot = 10; + double localTime, sumTime, minTime, maxTime; + + comm_t& comm = exchange->comm; + int size = comm.size(); + + pinnedMemory buf = exchange->h_workspace; + deviceMemory o_buf = exchange->o_workspace; + + device_t &device = exchange->platform.device; + + //dry run + for (int n=0;ngpu_aware) { + /*GPU-aware exchange*/ + exchange->Start (o_buf, 1, Add, Sym); + exchange->Finish(o_buf, 1, Add, Sym); + } else { + //if not using gpu-aware mpi move the halo buffer to the host + o_buf.copyTo(buf, exchange->Nhalo, + 0, properties_t("async", true)); + device.finish(); + + /*MPI exchange of host buffer*/ + exchange->Start (buf, 1, Add, Sym); + exchange->Finish(buf, 1, Add, Sym); + + // copy recv back to device + o_buf.copyFrom(buf, exchange->Nhalo, + 0, properties_t("async", true)); + device.finish(); //wait for transfer to finish + } + } + + //hot runs + timePoint_t start = Time(); + for (int n=0;ngpu_aware) { + /*GPU-aware exchange*/ + exchange->Start (o_buf, 1, Add, Sym); + exchange->Finish(o_buf, 1, Add, Sym); + } else { + //if not using gpu-aware mpi move the halo buffer to the host + o_buf.copyTo(buf, exchange->Nhalo, + 0, properties_t("async", true)); + device.finish(); + + /*MPI exchange of host buffer*/ + exchange->Start (buf, 1, Add, Sym); + exchange->Finish(buf, 1, Add, Sym); + + // copy recv back to device + o_buf.copyFrom(buf, exchange->Nhalo, + 0, properties_t("async", true)); + device.finish(); //wait for transfer to finish + } + } + timePoint_t end = Time(); + + localTime = ElapsedTime(start,end)/Nhot; + comm.Allreduce(localTime, sumTime, Comm::Sum); + comm.Allreduce(localTime, maxTime, Comm::Max); + comm.Allreduce(localTime, minTime, Comm::Min); + + time[0] = 
sumTime/size; //avg + time[1] = minTime; //min + time[2] = maxTime; //max +} + +static void HostExchangeTest(ogsExchange_t* exchange, double time[3]) { + const int Ncold = 10; + const int Nhot = 10; + double localTime, sumTime, minTime, maxTime; + + comm_t& comm = exchange->comm; + int size = comm.size(); + + pinnedMemory buf = exchange->h_workspace; + + //dry run + for (int n=0;nStart (buf, 1, Add, Sym); + exchange->Finish(buf, 1, Add, Sym); + } + + //hot runs + timePoint_t start = Time(); + for (int n=0;nStart (buf, 1, Add, Sym); + exchange->Finish(buf, 1, Add, Sym); + } + timePoint_t end = Time(); + + localTime = ElapsedTime(start,end)/Nhot; + comm.Allreduce(localTime, sumTime, Comm::Sum); + comm.Allreduce(localTime, maxTime, Comm::Max); + comm.Allreduce(localTime, minTime, Comm::Min); + + time[0] = sumTime/size; //avg + time[1] = minTime; //min + time[2] = maxTime; //max +} + +ogsExchange_t* ogsBase_t::AutoSetup(dlong Nshared, + memory &sharedNodes, + ogsOperator_t& _gatherHalo, + comm_t _comm, + platform_t &_platform, + const int verbose) { + + int rank, size; + rank = comm.rank(); + size = comm.size(); + + if (size==1) return new ogsPairwise_t(Nshared, sharedNodes, + _gatherHalo, dataStream, + comm, platform); + + ogsExchange_t* bestExchange; + Method method; + double bestTime; + +#ifdef GPU_AWARE_MPI + if (rank==0 && verbose) + printf(" Method Device Exchange (avg, min, max) Device Exchange (GPU-aware) Host Exchange \n"); +#else + if (rank==0 && verbose) + printf(" Method Device Exchange (avg, min, max) Host Exchange \n"); +#endif + + //Trigger JIT kernel builds + InitializeKernels(platform, ogs::Dfloat, ogs::Add); + + /******************************** + * Pairwise + ********************************/ + ogsExchange_t* pairwise = new ogsPairwise_t(Nshared, sharedNodes, + _gatherHalo, dataStream, + comm, platform); + + //standard copy to host - exchange - copy back to device + pairwise->gpu_aware=false; + + double pairwiseTime[3]; + DeviceExchangeTest(pairwise, 
pairwiseTime); + double pairwiseAvg = pairwiseTime[0]; + +#ifdef GPU_AWARE_MPI + //test GPU-aware exchange + pairwise->gpu_aware=true; + + double pairwiseGATime[3]; + DeviceExchangeTest(pairwise, pairwiseGATime); + + if (pairwiseGATime[0] < pairwiseAvg) + pairwiseAvg = pairwiseGATime[0]; + else + pairwise->gpu_aware=false; + +#endif + + //test exchange from host memory (just for reporting) + double pairwiseHostTime[3]; + HostExchangeTest(pairwise, pairwiseHostTime); + + bestExchange = pairwise; + method = Pairwise; + bestTime = pairwiseAvg; + +#ifdef GPU_AWARE_MPI + if (rank==0 && verbose) + printf(" Pairwise %5.3e %5.3e %5.3e %5.3e %5.3e %5.3e %5.3e %5.3e %5.3e \n", + pairwiseTime[0], pairwiseTime[1], pairwiseTime[2], + pairwiseGATime[0], pairwiseGATime[1], pairwiseGATime[2], + pairwiseHostTime[0], pairwiseHostTime[1], pairwiseHostTime[2]); +#else + if (rank==0 && verbose) + printf(" Pairwise %5.3e %5.3e %5.3e %5.3e %5.3e %5.3e \n", + pairwiseTime[0], pairwiseTime[1], pairwiseTime[2], + pairwiseHostTime[0], pairwiseHostTime[1], pairwiseHostTime[2]); +#endif + + /******************************** + * All-to-All + ********************************/ + ogsExchange_t* alltoall = new ogsAllToAll_t(Nshared, sharedNodes, + _gatherHalo, dataStream, + comm, platform); + //standard copy to host - exchange - copy back to device + alltoall->gpu_aware=false; + + double alltoallTime[3]; + DeviceExchangeTest(alltoall, alltoallTime); + double alltoallAvg = alltoallTime[0]; + +#ifdef GPU_AWARE_MPI + //test GPU-aware exchange + alltoall->gpu_aware=true; + + double alltoallGATime[3]; + DeviceExchangeTest(alltoall, alltoallGATime); + + if (alltoallGATime[0] < alltoallAvg) + alltoallAvg = alltoallGATime[0]; + else + alltoall->gpu_aware=false; + +#endif + + //test exchange from host memory (just for reporting) + double alltoallHostTime[3]; + HostExchangeTest(alltoall, alltoallHostTime); + + if (alltoallAvg < bestTime) { + delete bestExchange; + bestExchange = alltoall; + method = 
AllToAll; + bestTime = alltoallAvg; + } else { + delete alltoall; + } + +#ifdef GPU_AWARE_MPI + if (rank==0 && verbose) + printf(" AllToAll %5.3e %5.3e %5.3e %5.3e %5.3e %5.3e %5.3e %5.3e %5.3e \n", + alltoallTime[0], alltoallTime[1], alltoallTime[2], + alltoallGATime[0], alltoallGATime[1], alltoallGATime[2], + alltoallHostTime[0], alltoallHostTime[1], alltoallHostTime[2]); +#else + if (rank==0 && verbose) + printf(" AllToAll %5.3e %5.3e %5.3e %5.3e %5.3e %5.3e \n", + alltoallTime[0], alltoallTime[1], alltoallTime[2], + alltoallHostTime[0], alltoallHostTime[1], alltoallHostTime[2]); +#endif + + /******************************** + * Crystal Router + ********************************/ + ogsExchange_t* crystal = new ogsCrystalRouter_t(Nshared, sharedNodes, + _gatherHalo, dataStream, + comm, platform); + + //standard copy to host - exchange - copy back to device + crystal->gpu_aware=false; + + double crystalTime[3]; + DeviceExchangeTest(crystal, crystalTime); + double crystalAvg = crystalTime[0]; + +#ifdef GPU_AWARE_MPI + //test GPU-aware exchange + crystal->gpu_aware=true; + + double crystalGATime[3]; + DeviceExchangeTest(crystal, crystalGATime); + + if (crystalGATime[0] < crystalAvg) + crystalAvg = crystalGATime[0]; + else + crystal->gpu_aware=false; + +#endif + + //test exchange from host memory (just for reporting) + double crystalHostTime[3]; + HostExchangeTest(crystal, crystalHostTime); + + if (crystalAvg < bestTime) { + delete bestExchange; + bestExchange = crystal; + method = CrystalRouter; + bestTime = crystalAvg; + } else { + delete crystal; + } + +#ifdef GPU_AWARE_MPI + if (rank==0 && verbose) + printf(" CrystalRouter %5.3e %5.3e %5.3e %5.3e %5.3e %5.3e %5.3e %5.3e %5.3e \n", + crystalTime[0], crystalTime[1], crystalTime[2], + crystalGATime[0], crystalGATime[1], crystalGATime[2], + crystalHostTime[0], crystalHostTime[1], crystalHostTime[2]); +#else + if (rank==0 && verbose) + printf(" CrystalRouter %5.3e %5.3e %5.3e %5.3e %5.3e %5.3e \n", + crystalTime[0], 
crystalTime[1], crystalTime[2], + crystalHostTime[0], crystalHostTime[1], crystalHostTime[2]); +#endif + + if (rank==0 && verbose) { + switch (method) { + case AllToAll: + printf(" Exchange method selected: AllToAll"); break; + case Pairwise: + printf(" Exchange method selected: Pairwise"); break; + case CrystalRouter: + printf(" Exchange method selected: CrystalRouter"); break; + default: + break; + } + if (bestExchange->gpu_aware) printf(" (GPU-aware)"); + printf("\n"); + } + + return bestExchange; +} + + +} //namespace ogs + +} //namespace libp diff --git a/libs/ogs/ogsCrystalRouter.cpp b/libs/ogs/ogsCrystalRouter.cpp new file mode 100644 index 000000000..f62c836ea --- /dev/null +++ b/libs/ogs/ogsCrystalRouter.cpp @@ -0,0 +1,775 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +#include "ogs.hpp" +#include "ogs/ogsUtils.hpp" +#include "ogs/ogsExchange.hpp" + +#ifdef GLIBCXX_PARALLEL +#include +using __gnu_parallel::sort; +#else +using std::sort; +#endif + +namespace libp { + +namespace ogs { + +/********************************** +* Host exchange +***********************************/ +template +inline void ogsCrystalRouter_t::Start(pinnedMemory &buf, const int k, + const Op op, const Transpose trans){} + +template +inline void ogsCrystalRouter_t::Finish(pinnedMemory &buf, const int k, + const Op op, const Transpose trans){ + + + memory levels; + if (trans==NoTrans) { + levels = levelsN; + } else { + levels = levelsT; + } + + pinnedMemory sendBuf = h_sendspace; + + // To start, buf = h_workspace = h_work[(hbuf_id+0)%2]; + // sendBuf = h_sendspace; + for (int l=0;l recvBuf = h_workspace; + + //post recvs + if (levels[l].Nmsg>0) { + comm.Irecv(recvBuf + levels[l].recvOffset*k, + levels[l].partner, + k*levels[l].Nrecv0, + levels[l].partner, + request[1]); + } + if (levels[l].Nmsg==2) { + comm.Irecv(recvBuf + levels[l].recvOffset*k + levels[l].Nrecv0*k, + rank-1, + k*levels[l].Nrecv1, + rank-1, + request[2]); + } + + //assemble send buffer + extract(levels[l].Nsend, k, levels[l].sendIds, buf, sendBuf); + + //post send + comm.Isend(sendBuf, + levels[l].partner, + k*levels[l].Nsend, + rank, + request[0]); + + comm.Waitall(levels[l].Nmsg+1, request); + + //rotate buffers + h_workspace = h_work[(hbuf_id+1)%2]; + hbuf_id = (hbuf_id+1)%2; + + recvBuf = buf; + buf = h_workspace; + + //Gather the recv'd values into the haloBuffer + levels[l].gather.Gather(buf, recvBuf, k, op, Trans); + } +} + +void ogsCrystalRouter_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsCrystalRouter_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsCrystalRouter_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose 
trans) { Start(buf, k, op, trans); } +void ogsCrystalRouter_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsCrystalRouter_t::Finish(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsCrystalRouter_t::Finish(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsCrystalRouter_t::Finish(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsCrystalRouter_t::Finish(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } + +/********************************** +* GPU-aware exchange +***********************************/ +template +inline void ogsCrystalRouter_t::Start(deviceMemory &o_buf, + const int k, + const Op op, + const Transpose trans){ +} + +template +inline void ogsCrystalRouter_t::Finish(deviceMemory &o_buf, + const int k, + const Op op, + const Transpose trans){ + + device_t &device = platform.device; + + //get current stream + stream_t currentStream = device.getStream(); + + //the intermediate kernels are always overlapped with the default stream + device.setStream(dataStream); + + memory levels; + if (trans==NoTrans) { + levels = levelsN; + } else { + levels = levelsT; + } + + deviceMemory o_sendBuf = o_sendspace; + + // To start, o_buf = o_workspace = o_work[(buf_id+0)%2]; + // o_sendBuf = o_sendspace + for (int l=0;l o_recvBuf = o_workspace; + + //post recvs + if (levels[l].Nmsg>0) { + comm.Irecv(o_recvBuf + levels[l].recvOffset*k, + levels[l].partner, + k*levels[l].Nrecv0, + levels[l].partner, + request[1]); + } + if (levels[l].Nmsg==2) { + comm.Irecv(o_recvBuf + levels[l].recvOffset*k + levels[l].Nrecv0*k, + rank-1, + k*levels[l].Nrecv1, + rank-1, + request[2]); + } + + //assemble send buffer + if (levels[l].Nsend) { + extractKernel[ogsType::get()](levels[l].Nsend, k, + 
levels[l].o_sendIds, + o_buf, o_sendBuf); + device.finish(); + } + + //post send + comm.Isend(o_sendBuf, + levels[l].partner, + k*levels[l].Nsend, + rank, + request[0]); + + comm.Waitall(levels[l].Nmsg+1, request); + + //rotate buffers + o_workspace = o_work[(buf_id+1)%2]; + buf_id = (buf_id+1)%2; + + o_recvBuf = o_buf; + o_buf = o_workspace; + + //Gather the recv'd values into the haloBuffer + levels[l].gather.Gather(o_buf, o_recvBuf, k, op, Trans); + } + + device.setStream(currentStream); +} + +void ogsCrystalRouter_t::Start(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsCrystalRouter_t::Start(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsCrystalRouter_t::Start(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsCrystalRouter_t::Start(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsCrystalRouter_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsCrystalRouter_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsCrystalRouter_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsCrystalRouter_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } + + +/* + *Crystal Router performs the needed MPI communcation via recursive + * folding of a hypercube. Consider a set of NP ranks. 
We select a + * pivot point n_half=(NP+1)/2, and pair all ranks r<n_half (called the lo half) with ranks r>=n_half (called the hi half), as follows + * + * 0 <--> NP-1 + * 1 <--> NP-2 + * 2 <--> NP-3 + * * * * + * n_half-2 <--> NP-n_half+1 + * n_half-1 <--> NP-n_half + * + * The communication can then be summarized thusly: if a rank in the lo + * half has data needed by *any* rank in the hi half, it sends this data + * to its hi partner, and analogously for ranks in the hi half. Each rank + * therefore sends/receives a single message to/from its partner. + * + * The communication then proceeds recursively, applying the same folding + * procedure to the lo and hi halves separately, and stopping when the size + * of the local NP reaches 1. + * + * In the case where NP is odd, n_half-1 == NP-n_half and rank n_half-1 has + * no partner to communicate with. In this case, we assign rank r to the + * lo half of ranks, and rank n_half-1 sends its data to rank n_half (and + * receives no message, as rank n_half-2 is receiving all rank n_half's data). + + * To perform the Crystal Router exchange, each rank gathers its halo nodes to + * a coalesced buffer. At each step in the crystal router, a send buffer is + * gathered from this buffer and sent to the rank's partner. Simultaneously, a + * buffer is received from the rank's partner. This receive buffer is scattered + * and added into the coalesced halo buffer. After all communication is complete + * the halo nodes are scattered back to the output array. 
+ */ + +ogsCrystalRouter_t::ogsCrystalRouter_t(dlong Nshared, + memory &sharedNodes, + ogsOperator_t& gatherHalo, + stream_t _dataStream, + comm_t _comm, + platform_t &_platform): + ogsExchange_t(_platform,_comm,_dataStream) { + + NhaloP = gatherHalo.NrowsN; + Nhalo = gatherHalo.NrowsT; + + //first count how many levels we need + Nlevels = 0; + int np = size; + int np_offset=0; + while (np>1) { + int np_half = (np+1)/2; + int r_half = np_half + np_offset; + + int is_lo = (rank nodes(N); + + //setup is easier if we include copies of the nodes we own + // in the list of shared nodes + for(dlong n=0;n1) { + int np_half = (np+1)/2; + int r_half = np_half + np_offset; + + int is_lo = (rank0) + comm.Irecv(Nrecv0, partner, partner, request[1]); + if (Nmsg==2) + comm.Irecv(Nrecv1, r_half-1, r_half-1, request[2]); + + comm.Waitall(Nmsg+1, request); + + int Nrecv = Nrecv0+Nrecv1; + + //make room for the nodes we'll recv + if (is_lo) Nlo+=Nrecv; + else Nhi+=Nrecv; + + //split node list in two + memory loNodes(Nlo); + memory hiNodes(Nhi); + + Nlo=0, Nhi=0; + for (dlong n=0;n sendNodes = is_lo ? 
hiNodes : loNodes; + + //count how many entries from the halo buffer we're sending + int NentriesSendN=0; + int NentriesSendT=0; + for (dlong n=0;n0) NentriesSendN++; + NentriesSendT++; + } + } + levelsN[Nlevels].Nsend = NentriesSendN; + levelsT[Nlevels].Nsend = NentriesSendT; + levelsN[Nlevels].sendIds.malloc(NentriesSendN); + levelsT[Nlevels].sendIds.malloc(NentriesSendT); + + NentriesSendN=0; //reset + NentriesSendT=0; //reset + for (dlong n=0;n0) + levelsN[Nlevels].sendIds[NentriesSendN++] = sendNodes[n].newId; + + levelsT[Nlevels].sendIds[NentriesSendT++] = sendNodes[n].newId; + } + sendNodes[n].newId = -1; //wipe the newId before sending + } + levelsT[Nlevels].o_sendIds = platform.malloc(levelsT[Nlevels].sendIds); + levelsN[Nlevels].o_sendIds = platform.malloc(levelsN[Nlevels].sendIds); + + //share the entry count with our partner + comm.Isend(NentriesSendT, partner, rank, request[0]); + + int NentriesRecvT0=0, NentriesRecvT1=0; + if (Nmsg>0) + comm.Irecv(NentriesRecvT0, partner, partner, request[1]); + if (Nmsg==2) + comm.Irecv(NentriesRecvT1, r_half-1, r_half-1, request[2]); + + comm.Waitall(Nmsg+1, request); + + levelsT[Nlevels].Nrecv0 = NentriesRecvT0; + levelsT[Nlevels].Nrecv1 = NentriesRecvT1; + levelsT[Nlevels].recvOffset = NhaloExtT; + + comm.Isend(NentriesSendN, partner, rank, request[0]); + + int NentriesRecvN0=0, NentriesRecvN1=0; + if (Nmsg>0) + comm.Irecv(NentriesRecvN0, partner, partner, request[1]); + if (Nmsg==2) + comm.Irecv(NentriesRecvN1, r_half-1, r_half-1, request[2]); + + comm.Waitall(Nmsg+1, request); + + levelsN[Nlevels].Nrecv0 = NentriesRecvN0; + levelsN[Nlevels].Nrecv1 = NentriesRecvN1; + levelsN[Nlevels].recvOffset = NhaloExtN; + + //space needed in recv buffer for this level + dlong buf_size = NhaloExtT + NentriesRecvT0 + NentriesRecvT1; + haloBuf_size = (buf_size > haloBuf_size) ? 
buf_size : haloBuf_size; + + + //send half the list to our partner + comm.Isend(sendNodes, partner, Nsend, rank, request[0]); + + //recv new nodes from our partner(s) + if (Nmsg>0) + comm.Irecv(nodes+offset, partner, Nrecv0, partner, request[1]); + if (Nmsg==2) + comm.Irecv(nodes+offset+Nrecv0, r_half-1, Nrecv1, r_half-1, request[2]); + + comm.Waitall(Nmsg+1, request); + + sendNodes.free(); + + //We now have a list of nodes who's destinations are in our half + // of the hypercube + //We now build the gather into the haloBuffer + + + //record the current order + for (dlong n=0;n abs(b.baseId)) return false; + + return a.newId > b.newId; //positive newIds first + }); + + //find how many positive ids there will be in the extended halo + dlong start = 0; + NhaloExtN=0; + NhaloExtT=0; + for (dlong n=0;n= Nhalo || id==-1) { + for (dlong i=start;i0) { + NhaloExtN++; + break; + } + } + NhaloExtT++; + } + start = end; + } + } + + + //make an index map to save the original extended halo ids + memory indexMap(NhaloExtT); + + //fill newIds of new entries if possible, or give them an index + NhaloExtT = Nhalo + NhaloExtN; + NhaloExtN = Nhalo; + start = 0; + for (dlong n=0;n= Nhalo || id==-1) { + int sign = -2; + for (dlong i=start;i0) { + sign = nodes[i].sign; + break; + } + } + + if (sign>0) + id = NhaloExtN++; + else + id = NhaloExtT++; + + //save the orignal id + indexMap[id-Nhalo] = nodes[start].newId; + } + + //write id into this baseId group + for (dlong i=start;i= Nhalo) { + if (nodes[n].sign >0) gatherN.rowStartsT[id+1]++; + gatherT.rowStartsT[id+1]++; + } + } + } + + //look through first message for nodes to gather + for (dlong n=offset;n0) gatherN.rowStartsT[id+1]++; + gatherT.rowStartsT[id+1]++; + } + } + //look through second message for nodes to gather + for (dlong n=offset+Nrecv0;n0) gatherN.rowStartsT[id+1]++; + gatherT.rowStartsT[id+1]++; + } + } + + for (dlong i=0;i= Nhalo) { + if (nodes[n].sign > 0) { + gatherN.colIdsT[gatherN.rowStartsT[id]++] = 
indexMap[id-Nhalo]; + } + gatherT.colIdsT[gatherT.rowStartsT[id]++] = indexMap[id-Nhalo]; + } + } + } + + indexMap.free(); + + dlong NentriesRecvN=levelsN[Nlevels].recvOffset; + dlong NentriesRecvT=levelsT[Nlevels].recvOffset; + //look through first message for nodes to gatherT + for (dlong n=offset;n 0) { + gatherN.colIdsT[gatherN.rowStartsT[id]++] = NentriesRecvN++; + } + gatherT.colIdsT[gatherT.rowStartsT[id]++] = NentriesRecvT++; + } + } + //look through second message for nodes to gatherT + for (dlong n=offset+Nrecv0;n 0) { + gatherN.colIdsT[gatherN.rowStartsT[id]++] = NentriesRecvN++; + } + gatherT.colIdsT[gatherT.rowStartsT[id]++] = NentriesRecvT++; + } + } + + //reset row starts + for (dlong i=gatherT.NrowsT;i>0;--i) { + gatherT.rowStartsT[i] = gatherT.rowStartsT[i-1]; + gatherN.rowStartsT[i] = gatherN.rowStartsT[i-1]; + } + gatherT.rowStartsT[0] = 0; + gatherN.rowStartsT[0] = 0; + + gatherT.o_rowStartsT = platform.malloc(gatherT.rowStartsT); + gatherT.o_rowStartsN = gatherT.o_rowStartsT; + gatherN.o_rowStartsT = platform.malloc(gatherN.rowStartsT); + gatherN.o_rowStartsN = gatherN.o_rowStartsT; + gatherT.o_colIdsT = platform.malloc(gatherT.colIdsT); + gatherT.o_colIdsN = gatherT.o_colIdsT; + gatherN.o_colIdsT = platform.malloc(gatherN.colIdsT); + gatherN.o_colIdsN = gatherN.o_colIdsT; + + gatherN.setupRowBlocks(); + gatherT.setupRowBlocks(); + + levelsT[Nlevels].gather = gatherT; + levelsN[Nlevels].gather = gatherN; + + //sort the new node list by newId + sort(nodes.ptr(), nodes.ptr()+N, + [](const parallelNode_t& a, const parallelNode_t& b) { + return a.newId < b.newId; //group by newId (which also groups by abs(baseId)) + }); + + //propagate the sign of recvieved nodes + start = 0; + for (dlong n=0;n0) { + for (dlong j=start;j1) nodes.free(); + + NsendMax=0, NrecvMax=0; + for (int k=0;kNsendMax) ? Nsend : NsendMax; + int Nrecv = levelsT[k].recvOffset + + levelsT[k].Nrecv0 + levelsT[k].Nrecv1; + NrecvMax = (Nrecv>NrecvMax) ? 
Nrecv : NrecvMax; + } + + //make scratch space + AllocBuffer(sizeof(dfloat)); +} + +void ogsCrystalRouter_t::AllocBuffer(size_t Nbytes) { + + if (o_sendspace.size() < NsendMax*Nbytes) { + h_sendspace = platform.hostMalloc(NsendMax*Nbytes); + o_sendspace = platform.malloc(NsendMax*Nbytes); + } + if (o_work[0].size() < NrecvMax*Nbytes) { + h_work[0] = platform.hostMalloc(NrecvMax*Nbytes); + h_work[1] = platform.hostMalloc(NrecvMax*Nbytes); + h_workspace = h_work[0]; + hbuf_id=0; + + o_work[0] = platform.malloc(NrecvMax*Nbytes); + o_work[1] = platform.malloc(NrecvMax*Nbytes); + o_workspace = o_work[0]; + buf_id=0; + } +} + +} //namespace ogs + +} //namespace libp diff --git a/libs/ogs/ogsHalo.cpp b/libs/ogs/ogsHalo.cpp new file mode 100644 index 000000000..7e9743531 --- /dev/null +++ b/libs/ogs/ogsHalo.cpp @@ -0,0 +1,395 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +#include "ogs.hpp" +#include "ogs/ogsUtils.hpp" +#include "ogs/ogsOperator.hpp" +#include "ogs/ogsExchange.hpp" + +namespace libp { + +namespace ogs { + +/******************************** + * Device Exchange + ********************************/ +template +void halo_t::Exchange(deviceMemory o_v, const int k) { + ExchangeStart (o_v, k); + ExchangeFinish(o_v, k); +} + +template +void halo_t::ExchangeStart(deviceMemory o_v, const int k){ + exchange->AllocBuffer(k*sizeof(T)); + + deviceMemory o_haloBuf = exchange->o_workspace; + + if (exchange->gpu_aware) { + if (gathered_halo) { + //if this halo was build from a gathered ogs the halo nodes are at the end + o_haloBuf.copyFrom(o_v + k*NlocalT, k*NhaloP, + 0, properties_t("async", true)); + } else { + //collect halo buffer + gatherHalo->Gather(o_haloBuf, o_v, k, Add, NoTrans); + } + + //prepare MPI exchange + exchange->Start(o_haloBuf, k, Add, NoTrans); + + } else { + //get current stream + device_t &device = platform.device; + stream_t currentStream = device.getStream(); + + //if not using gpu-aware mpi move the halo buffer to the host + pinnedMemory haloBuf = exchange->h_workspace; + + if (gathered_halo) { + //wait for o_v to be ready + device.finish(); + + //queue copy to host + device.setStream(dataStream); + haloBuf.copyFrom(o_v + k*NlocalT, NhaloP*k, + 0, properties_t("async", true)); + device.setStream(currentStream); + } else { + //collect halo buffer + gatherHalo->Gather(o_haloBuf, o_v, k, Add, NoTrans); + + //wait for o_haloBuf to be ready + device.finish(); + + //queue copy to host + device.setStream(dataStream); + haloBuf.copyFrom(o_haloBuf, NhaloP*k, + 0, properties_t("async", true)); + device.setStream(currentStream); + } + } +} + +template +void halo_t::ExchangeFinish(deviceMemory o_v, const int k){ + + deviceMemory o_haloBuf = exchange->o_workspace; + + //write exchanged halo buffer back to vector + if (exchange->gpu_aware) { + //finish MPI exchange + exchange->Finish(o_haloBuf, k, Add, NoTrans); + 
+ if (gathered_halo) { + o_haloBuf.copyTo(o_v + k*(NlocalT+NhaloP), k*Nhalo, + k*NhaloP, properties_t("async", true)); + } else { + gatherHalo->Scatter(o_v, o_haloBuf, k, NoTrans); + } + } else { + pinnedMemory haloBuf = exchange->h_workspace; + + //get current stream + device_t &device = platform.device; + stream_t currentStream = device.getStream(); + + //synchronize data stream to ensure the buffer is on the host + device.setStream(dataStream); + device.finish(); + + /*MPI exchange of host buffer*/ + exchange->Start (haloBuf, k, Add, NoTrans); + exchange->Finish(haloBuf, k, Add, NoTrans); + + // copy recv back to device + if (gathered_halo) { + haloBuf.copyTo(o_v + k*(NlocalT+NhaloP), k*Nhalo, + k*NhaloP, properties_t("async", true)); + device.finish(); //wait for transfer to finish + device.setStream(currentStream); + } else { + haloBuf.copyTo(o_haloBuf+k*NhaloP, k*Nhalo, + k*NhaloP, properties_t("async", true)); + device.finish(); //wait for transfer to finish + device.setStream(currentStream); + + gatherHalo->Scatter(o_v, o_haloBuf, k, NoTrans); + } + } +} + +template void halo_t::ExchangeStart(deviceMemory o_v, const int k); +template void halo_t::ExchangeStart(deviceMemory o_v, const int k); +template void halo_t::ExchangeStart(deviceMemory o_v, const int k); +template void halo_t::ExchangeStart(deviceMemory o_v, const int k); +template void halo_t::ExchangeFinish(deviceMemory o_v, const int k); +template void halo_t::ExchangeFinish(deviceMemory o_v, const int k); +template void halo_t::ExchangeFinish(deviceMemory o_v, const int k); +template void halo_t::ExchangeFinish(deviceMemory o_v, const int k); +template void halo_t::Exchange(deviceMemory o_v, const int k); +template void halo_t::Exchange(deviceMemory o_v, const int k); +template void halo_t::Exchange(deviceMemory o_v, const int k); +template void halo_t::Exchange(deviceMemory o_v, const int k); + +//host version +template +void halo_t::Exchange(memory v, const int k) { + ExchangeStart (v, k); + 
ExchangeFinish(v, k); +} + +template +void halo_t::ExchangeStart(memory v, const int k) { + exchange->AllocBuffer(k*sizeof(T)); + + pinnedMemory haloBuf = exchange->h_workspace; + + //collect halo buffer + if (gathered_halo) { + //if this halo was build from a gathered ogs the halo nodes are at the end + haloBuf.copyFrom(v + k*NlocalT, k*NhaloP); + } else { + gatherHalo->Gather(haloBuf, v, k, Add, NoTrans); + } + + //Prepare MPI exchange + exchange->Start(haloBuf, k, Add, NoTrans); +} + +template +void halo_t::ExchangeFinish(memory v, const int k) { + + pinnedMemory haloBuf = exchange->h_workspace; + + //finish MPI exchange + exchange->Finish(haloBuf, k, Add, NoTrans); + + //write exchanged halo buffer back to vector + if (gathered_halo) { + //if this halo was build from a gathered ogs the halo nodes are at the end + haloBuf.copyTo(v + k*(NlocalT+NhaloP), + k*Nhalo, + k*NhaloP); + } else { + gatherHalo->Scatter(v, haloBuf, k, NoTrans); + } +} + +template void halo_t::ExchangeStart(memory v, const int k); +template void halo_t::ExchangeStart(memory v, const int k); +template void halo_t::ExchangeStart(memory v, const int k); +template void halo_t::ExchangeStart(memory v, const int k); +template void halo_t::ExchangeFinish(memory v, const int k); +template void halo_t::ExchangeFinish(memory v, const int k); +template void halo_t::ExchangeFinish(memory v, const int k); +template void halo_t::ExchangeFinish(memory v, const int k); +template void halo_t::Exchange(memory v, const int k); +template void halo_t::Exchange(memory v, const int k); +template void halo_t::Exchange(memory v, const int k); +template void halo_t::Exchange(memory v, const int k); + +/******************************** + * Combine + ********************************/ +template +void halo_t::Combine(deviceMemory o_v, const int k) { + CombineStart (o_v, k); + CombineFinish(o_v, k); +} + +template +void halo_t::CombineStart(deviceMemory o_v, const int k){ + exchange->AllocBuffer(k*sizeof(T)); + + 
deviceMemory o_haloBuf = exchange->o_workspace; + + if (exchange->gpu_aware) { + if (gathered_halo) { + //if this halo was build from a gathered ogs the halo nodes are at the end + o_haloBuf.copyFrom(o_v + k*NlocalT, k*NhaloT, + 0, properties_t("async", true)); + } else { + //collect halo buffer + gatherHalo->Gather(o_haloBuf, o_v, k, Add, Trans); + } + + //prepare MPI exchange + exchange->Start(o_haloBuf, k, Add, Trans); + } else { + //get current stream + device_t &device = platform.device; + stream_t currentStream = device.getStream(); + + //if not using gpu-aware mpi move the halo buffer to the host + pinnedMemory haloBuf = exchange->h_workspace; + + if (gathered_halo) { + //wait for o_v to be ready + device.finish(); + + //queue copy to host + device.setStream(dataStream); + haloBuf.copyFrom(o_v + k*NlocalT, NhaloT*k, + 0, properties_t("async", true)); + device.setStream(currentStream); + } else { + //collect halo buffer + gatherHalo->Gather(o_haloBuf, o_v, k, Add, Trans); + + //wait for o_haloBuf to be ready + device.finish(); + + //queue copy to host + device.setStream(dataStream); + haloBuf.copyFrom(o_haloBuf, NhaloT*k, + 0, properties_t("async", true)); + device.setStream(currentStream); + } + } +} + +template +void halo_t::CombineFinish(deviceMemory o_v, const int k){ + + deviceMemory o_haloBuf = exchange->o_workspace; + + //write exchanged halo buffer back to vector + if (exchange->gpu_aware) { + //finish MPI exchange + exchange->Finish(o_haloBuf, k, Add, Trans); + + if (gathered_halo) { + //if this halo was build from a gathered ogs the halo nodes are at the end + o_haloBuf.copyTo(o_v + k*NlocalT, k*NhaloP, + 0, properties_t("async", true)); + } else { + gatherHalo->Scatter(o_v, o_haloBuf, k, Trans); + } + } else { + pinnedMemory haloBuf = exchange->h_workspace; + + //get current stream + device_t &device = platform.device; + stream_t currentStream = device.getStream(); + + //synchronize data stream to ensure the buffer is on the host + 
device.setStream(dataStream); + device.finish(); + + /*MPI exchange of host buffer*/ + exchange->Start (haloBuf, k, Add, Trans); + exchange->Finish(haloBuf, k, Add, Trans); + + if (gathered_halo) { + // copy recv back to device + haloBuf.copyTo(o_v + k*NlocalT, NhaloP*k, + 0, properties_t("async", true)); + device.finish(); //wait for transfer to finish + device.setStream(currentStream); + } else { + haloBuf.copyTo(o_haloBuf, NhaloP*k, + 0, properties_t("async", true)); + device.finish(); //wait for transfer to finish + device.setStream(currentStream); + + gatherHalo->Scatter(o_v, o_haloBuf, k, Trans); + } + } +} + +template void halo_t::CombineStart(deviceMemory o_v, const int k); +template void halo_t::CombineStart(deviceMemory o_v, const int k); +template void halo_t::CombineStart(deviceMemory o_v, const int k); +template void halo_t::CombineStart(deviceMemory o_v, const int k); +template void halo_t::CombineFinish(deviceMemory o_v, const int k); +template void halo_t::CombineFinish(deviceMemory o_v, const int k); +template void halo_t::CombineFinish(deviceMemory o_v, const int k); +template void halo_t::CombineFinish(deviceMemory o_v, const int k); +template void halo_t::Combine(deviceMemory o_v, const int k); +template void halo_t::Combine(deviceMemory o_v, const int k); +template void halo_t::Combine(deviceMemory o_v, const int k); +template void halo_t::Combine(deviceMemory o_v, const int k); + +//host version +template +void halo_t::Combine(memory v, const int k) { + CombineStart (v, k); + CombineFinish(v, k); +} + +template +void halo_t::CombineStart(memory v, const int k) { + exchange->AllocBuffer(k*sizeof(T)); + + pinnedMemory haloBuf = exchange->h_workspace; + + //collect halo buffer + if (gathered_halo) { + //if this halo was build from a gathered ogs the halo nodes are at the end + haloBuf.copyFrom(v + k*NlocalT, k*NhaloT); + } else { + gatherHalo->Gather(haloBuf, v, k, Add, Trans); + } + + //Prepare MPI exchange + exchange->Start(haloBuf, k, Add, 
Trans); +} + + +template +void halo_t::CombineFinish(memory v, const int k) { + + pinnedMemory haloBuf = exchange->h_workspace; + + //finish MPI exchange + exchange->Finish(haloBuf, k, Add, Trans); + + //write exchanged halo buffer back to vector + if (gathered_halo) { + //if this halo was build from a gathered ogs the halo nodes are at the end + haloBuf.copyTo(v + k*NlocalT, k*NhaloP); + } else { + gatherHalo->Scatter(v, haloBuf, k, Trans); + } +} + +template void halo_t::CombineStart(memory v, const int k); +template void halo_t::CombineStart(memory v, const int k); +template void halo_t::CombineStart(memory v, const int k); +template void halo_t::CombineStart(memory v, const int k); +template void halo_t::CombineFinish(memory v, const int k); +template void halo_t::CombineFinish(memory v, const int k); +template void halo_t::CombineFinish(memory v, const int k); +template void halo_t::CombineFinish(memory v, const int k); +template void halo_t::Combine(memory v, const int k); +template void halo_t::Combine(memory v, const int k); +template void halo_t::Combine(memory v, const int k); +template void halo_t::Combine(memory v, const int k); + +} //namespace ogs + +} //namespace libp diff --git a/libs/ogs/ogsKernels.cpp b/libs/ogs/ogsKernels.cpp deleted file mode 100644 index de33ecd45..000000000 --- a/libs/ogs/ogsKernels.cpp +++ /dev/null @@ -1,132 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included 
in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - - -#include "ogs.hpp" -#include "ogs/ogsKernels.hpp" - -//convert a macro command into a string -#define _STR(x) #x -#define STR(x) _STR(x) - -namespace ogs { - - //NC: Hard code these for now. Should be sufficient for GPU devices, but needs attention for CPU - const int blockSize = 256; - const int gatherNodesPerBlock = 1024; //should be a multiple of blockSize for good unrolling - - int Nrefs = 0; - - occa::stream dataStream; - -#define DEFINE_GATHERSCATTER_KERNEL(T,OP) \ - occa::kernel gatherScatterKernel_##T##_##OP; - -#define DEFINE_GATHER_KERNEL(T,OP) \ - occa::kernel gatherKernel_##T##_##OP; - -#define DEFINE_SCATTER_KERNEL(T) \ - occa::kernel scatterKernel_##T; - -#define DEFINE_KERNELS(T) \ - OGS_FOR_EACH_OP(T,DEFINE_GATHERSCATTER_KERNEL) \ - OGS_FOR_EACH_OP(T,DEFINE_GATHER_KERNEL) \ - DEFINE_SCATTER_KERNEL(T) - -OGS_FOR_EACH_TYPE(DEFINE_KERNELS) - - -void initKernels(platform_t& platform) { - - int rank = platform.rank; - - dataStream = platform.device.createStream(); - - occa::properties kernelInfo = platform.props; - - kernelInfo["defines/p_blockSize"] = blockSize; - kernelInfo["defines/p_gatherNodesPerBlock"] = gatherNodesPerBlock; - -#define DEFINE_OCCA_ADD_INIT(T) \ - kernelInfo["defines/init_" STR(T) "_add"] = (T) 0; \ - kernelInfo["defines/init_" STR(T) "_mul"] = (T) 1; \ - kernelInfo["defines/init_" STR(T) "_min"] = (T) std::numeric_limits::max(); \ - kernelInfo["defines/init_" STR(T) "_max"] = (T) 
-std::numeric_limits::max(); - -//OCCA properties don't have an operator+ for long long int, so alias it to int64_t -typedef int64_t long_long; - OGS_FOR_EACH_TYPE(DEFINE_OCCA_ADD_INIT) - - kernelInfo["includes"] += LIBP_DIR "/include/ogs/ogsDefs.h"; - - if (rank==0) {printf("Compiling GatherScatter Kernels...");fflush(stdout);} - -#define DEFINE_GATHERSCATTER_BUILD(T,OP) \ - gatherScatterKernel_##T##_##OP = platform.buildKernel(OGS_DIR "/okl/gatherScatter.okl",\ - "gatherScatter_" STR(T) "_" STR(OP), \ - kernelInfo); \ - -#define DEFINE_GATHER_BUILD(T,OP) \ - gatherKernel_##T##_##OP = platform.buildKernel(OGS_DIR "/okl/gatherScatter.okl", \ - "gather_" STR(T) "_" STR(OP), \ - kernelInfo); \ - -#define DEFINE_SCATTER_BUILD(T) \ - scatterKernel_##T = platform.buildKernel(OGS_DIR "/okl/gatherScatter.okl", \ - "scatter_" STR(T), \ - kernelInfo); \ - -#define DEFINE_BUILD(T) \ - OGS_FOR_EACH_OP(T,DEFINE_GATHERSCATTER_BUILD) \ - OGS_FOR_EACH_OP(T,DEFINE_GATHER_BUILD) \ - DEFINE_SCATTER_BUILD(T) - - OGS_FOR_EACH_TYPE(DEFINE_BUILD) - - if(rank==0) printf("done.\n"); -} - -void freeKernels() { - -#define DEFINE_GATHERSCATTER_FREE(T,OP) \ - gatherScatterKernel_##T##_##OP.free(); - -#define DEFINE_GATHER_FREE(T,OP) \ - gatherKernel_##T##_##OP.free(); - -#define DEFINE_SCATTER_FREE(T) \ - scatterKernel_##T.free(); - -#define DEFINE_FREE(T) \ - OGS_FOR_EACH_OP(T,DEFINE_GATHERSCATTER_FREE) \ - OGS_FOR_EACH_OP(T,DEFINE_GATHER_FREE) \ - DEFINE_SCATTER_FREE(T) - - OGS_FOR_EACH_TYPE(DEFINE_FREE) -} - -} //namespace ogs - diff --git a/libs/ogs/ogsOperator.cpp b/libs/ogs/ogsOperator.cpp new file mode 100644 index 000000000..19a496060 --- /dev/null +++ b/libs/ogs/ogsOperator.cpp @@ -0,0 +1,635 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without 
restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include +#include "ogs.hpp" +#include "ogs/ogsUtils.hpp" +#include "ogs/ogsOperator.hpp" + +namespace libp { + +namespace ogs { + +template +struct Op_Add { + inline const T init() const { return T{0}; } + inline void operator()(T& gv, const T v) const { gv += v; } +}; +template +struct Op_Mul { + inline const T init() const { return T{1}; } + inline void operator()(T& gv, const T v) const { gv *= v; } +}; +template +struct Op_Max { + inline const T init() const { return -std::numeric_limits::max(); } + inline void operator()(T& gv, const T v) const { gv = (v>gv) ? 
v : gv; } +}; +template +struct Op_Min { + inline const T init() const {return std::numeric_limits::max(); } + inline void operator()(T& gv, const T v) const { gv = (v class U, + template class V, + template class Op, + typename T> +void ogsOperator_t::Gather(U gv, + const V v, + const int K, + const Transpose trans) { + + dlong Nrows; + dlong *__restrict__ rowStarts, *__restrict__ colIds; + if (trans==NoTrans) { + Nrows = NrowsN; + rowStarts = rowStartsN.ptr(); + colIds = colIdsN.ptr(); + } else { + Nrows = NrowsT; + rowStarts = rowStartsT.ptr(); + colIds = colIdsT.ptr(); + } + + const T*__restrict__ v_ptr = v.ptr(); + T*__restrict__ gv_ptr = gv.ptr(); + + const Op op; + + if (K==1) { + #pragma omp parallel for + for(dlong n=0;n class U, + template class V, + typename T> +void ogsOperator_t::Gather(U gv, + const V v, + const int k, + const Op op, + const Transpose trans) { + switch (op){ + case Add: + Gather(gv, v, k, trans); break; + case Mul: + Gather(gv, v, k, trans); break; + case Max: + Gather(gv, v, k, trans); break; + case Min: + Gather(gv, v, k, trans); break; + } +} + +template +void ogsOperator_t::Gather(memory gv, const memory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(memory gv, const memory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(memory gv, const memory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(memory gv, const memory v, + const int k, const Op op, const Transpose trans); + +template +void ogsOperator_t::Gather(pinnedMemory gv, const memory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(pinnedMemory gv, const memory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(pinnedMemory gv, const memory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(pinnedMemory gv, const 
memory v, + const int k, const Op op, const Transpose trans); + +template +void ogsOperator_t::Gather(pinnedMemory gv, const pinnedMemory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(pinnedMemory gv, const pinnedMemory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(pinnedMemory gv, const pinnedMemory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(pinnedMemory gv, const pinnedMemory v, + const int k, const Op op, const Transpose trans); + + +template +void ogsOperator_t::Gather(deviceMemory o_gv, + deviceMemory o_v, + const int k, + const Op op, + const Transpose trans) { + constexpr Type type = ogsType::get(); + InitializeKernels(platform, type, op); + + if (trans==NoTrans) { + if (NrowBlocksN) + gatherKernel[type][op](NrowBlocksN, + k, + o_blockRowStartsN, + o_rowStartsN, + o_colIdsN, + o_v, + o_gv); + } else { + if (NrowBlocksT) + gatherKernel[type][op](NrowBlocksT, + k, + o_blockRowStartsT, + o_rowStartsT, + o_colIdsT, + o_v, + o_gv); + } +} + +template +void ogsOperator_t::Gather(deviceMemory gv, const deviceMemory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(deviceMemory gv, const deviceMemory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(deviceMemory gv, const deviceMemory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(deviceMemory gv, const deviceMemory v, + const int k, const Op op, const Transpose trans); + + +/******************************** + * Scatter Operation + ********************************/ +template class U, + template class V, + typename T> +void ogsOperator_t::Scatter(U v, const V gv, + const int K, const Transpose trans) { + + dlong Nrows; + dlong *__restrict__ rowStarts, *__restrict__ colIds; + if (trans==Trans) { + Nrows = NrowsN; + rowStarts = rowStartsN.ptr(); + 
colIds = colIdsN.ptr(); + } else { + Nrows = NrowsT; + rowStarts = rowStartsT.ptr(); + colIds = colIdsT.ptr(); + } + + T*__restrict__ v_ptr = v.ptr(); + const T*__restrict__ gv_ptr = gv.ptr(); + + if (K==1) { + #pragma omp parallel for + for(dlong n=0;n v, const memory gv, + const int K, const Transpose trans); +template +void ogsOperator_t::Scatter(memory v, const memory gv, + const int K, const Transpose trans); +template +void ogsOperator_t::Scatter(memory v, const memory gv, + const int K, const Transpose trans); +template +void ogsOperator_t::Scatter(memory v, const memory gv, + const int K, const Transpose trans); + +template +void ogsOperator_t::Scatter(memory v, const pinnedMemory gv, + const int K, const Transpose trans); +template +void ogsOperator_t::Scatter(memory v, const pinnedMemory gv, + const int K, const Transpose trans); +template +void ogsOperator_t::Scatter(memory v, const pinnedMemory gv, + const int K, const Transpose trans); +template +void ogsOperator_t::Scatter(memory v, const pinnedMemory gv, + const int K, const Transpose trans); + +template +void ogsOperator_t::Scatter(deviceMemory o_v, + deviceMemory o_gv, + const int k, + const Transpose trans) { + constexpr Type type = ogsType::get(); + InitializeKernels(platform, type, Add); + + if (trans==Trans) { + if (NrowBlocksN) + scatterKernel[type](NrowBlocksN, + k, + o_blockRowStartsN, + o_rowStartsN, + o_colIdsN, + o_gv, + o_v); + } else { + if (NrowBlocksT) + scatterKernel[type](NrowBlocksT, + k, + o_blockRowStartsT, + o_rowStartsT, + o_colIdsT, + o_gv, + o_v); + } +} + +template +void ogsOperator_t::Scatter(deviceMemory v, const deviceMemory gv, + const int K, const Transpose trans); +template +void ogsOperator_t::Scatter(deviceMemory v, const deviceMemory gv, + const int K, const Transpose trans); +template +void ogsOperator_t::Scatter(deviceMemory v, const deviceMemory gv, + const int K, const Transpose trans); +template +void ogsOperator_t::Scatter(deviceMemory v, const deviceMemory 
gv, + const int K, const Transpose trans); + +/******************************** + * GatherScatter Operation + ********************************/ +template class U, + template class Op, + typename T> +void ogsOperator_t::GatherScatter(U v, const int K, + const Transpose trans) { + + dlong Nrows; + dlong *__restrict__ gRowStarts, *__restrict__ gColIds; + dlong *__restrict__ sRowStarts, *__restrict__ sColIds; + + if (trans==Trans) { + Nrows = NrowsN; + gRowStarts = rowStartsT.ptr(); + gColIds = colIdsT.ptr(); + sRowStarts = rowStartsN.ptr(); + sColIds = colIdsN.ptr(); + } else if (trans==Sym) { + Nrows = NrowsT; + gRowStarts = rowStartsT.ptr(); + gColIds = colIdsT.ptr(); + sRowStarts = rowStartsT.ptr(); + sColIds = colIdsT.ptr(); + } else { + Nrows = NrowsT; + gRowStarts = rowStartsN.ptr(); + gColIds = colIdsN.ptr(); + sRowStarts = rowStartsT.ptr(); + sColIds = colIdsT.ptr(); + } + + T*__restrict__ v_ptr = v.ptr(); + + const Op op; + + if (K==1) { + #pragma omp parallel for + for(dlong n=0;n class U, + typename T> +void ogsOperator_t::GatherScatter(U v, + const int k, + const Op op, + const Transpose trans) { + switch (op){ + case Add: + GatherScatter(v, k, trans); break; + case Mul: + GatherScatter(v, k, trans); break; + case Max: + GatherScatter(v, k, trans); break; + case Min: + GatherScatter(v, k, trans); break; + } +} + +template +void ogsOperator_t::GatherScatter(memory v,const int k, + const Op op, const Transpose trans); +template +void ogsOperator_t::GatherScatter(memory v,const int k, + const Op op, const Transpose trans); +template +void ogsOperator_t::GatherScatter(memory v,const int k, + const Op op, const Transpose trans); +template +void ogsOperator_t::GatherScatter(memory v,const int k, + const Op op, const Transpose trans); + +template +void ogsOperator_t::GatherScatter(deviceMemory o_v, + const int k, + const Op op, + const Transpose trans) { + constexpr Type type = ogsType::get(); + InitializeKernels(platform, type, Add); + + if (trans==Trans) { + if 
(NrowBlocksT) + gatherScatterKernel[type][Add](NrowBlocksT, + k, + o_blockRowStartsT, + o_rowStartsT, + o_colIdsT, + o_rowStartsN, + o_colIdsN, + o_v); + } else if (trans==Sym) { + if (NrowBlocksT) + gatherScatterKernel[type][Add](NrowBlocksT, + k, + o_blockRowStartsT, + o_rowStartsT, + o_colIdsT, + o_rowStartsT, + o_colIdsT, + o_v); + } else { + if (NrowBlocksT) + gatherScatterKernel[type][Add](NrowBlocksT, + k, + o_blockRowStartsT, + o_rowStartsN, + o_colIdsN, + o_rowStartsT, + o_colIdsT, + o_v); + } +} + +template +void ogsOperator_t::GatherScatter(deviceMemory v,const int k, + const Op op, const Transpose trans); +template +void ogsOperator_t::GatherScatter(deviceMemory v,const int k, + const Op op, const Transpose trans); +template +void ogsOperator_t::GatherScatter(deviceMemory v,const int k, + const Op op, const Transpose trans); +template +void ogsOperator_t::GatherScatter(deviceMemory v,const int k, + const Op op, const Transpose trans); + +void ogsOperator_t::setupRowBlocks() { + + dlong blockSumN=0, blockSumT=0; + NrowBlocksN=0, NrowBlocksT=0; + + if (NrowsN) NrowBlocksN++; + if (NrowsT) NrowBlocksT++; + + for (dlong i=0;i gatherNodesPerBlock); + LIBP_ABORT("Multiplicity of global node id: " << i + << " in ogsOperator_t::setupRowBlocks is too large.", + rowSizeT > gatherNodesPerBlock); + + if (blockSumN+rowSizeN > gatherNodesPerBlock) { //adding this row will exceed the nnz per block + NrowBlocksN++; //count the previous block + blockSumN=rowSizeN; //start a new row block + } else { + blockSumN+=rowSizeN; //add this row to the block + } + + if (blockSumT+rowSizeT > gatherNodesPerBlock) { //adding this row will exceed the nnz per block + NrowBlocksT++; //count the previous block + blockSumT=rowSizeT; //start a new row block + } else { + blockSumT+=rowSizeT; //add this row to the block + } + } + + blockRowStartsN.calloc(NrowBlocksN+1); + blockRowStartsT.calloc(NrowBlocksT+1); + + blockSumN=0, blockSumT=0; + NrowBlocksN=0, NrowBlocksT=0; + if (NrowsN) 
NrowBlocksN++; + if (NrowsT) NrowBlocksT++; + + for (dlong i=0;i gatherNodesPerBlock) { //adding this row will exceed the nnz per block + blockRowStartsN[NrowBlocksN++] = i; //mark the previous block + blockSumN=rowSizeN; //start a new row block + } else { + blockSumN+=rowSizeN; //add this row to the block + } + if (blockSumT+rowSizeT > gatherNodesPerBlock) { //adding this row will exceed the nnz per block + blockRowStartsT[NrowBlocksT++] = i; //mark the previous block + blockSumT=rowSizeT; //start a new row block + } else { + blockSumT+=rowSizeT; //add this row to the block + } + } + blockRowStartsN[NrowBlocksN] = NrowsN; + blockRowStartsT[NrowBlocksT] = NrowsT; + + o_blockRowStartsN = platform.malloc(blockRowStartsN); + o_blockRowStartsT = platform.malloc(blockRowStartsT); +} + +void ogsOperator_t::Free() { + rowStartsT.free(); + colIdsT.free(); + rowStartsN.free(); + colIdsN.free(); + + o_rowStartsT.free(); + o_colIdsT.free(); + o_rowStartsN.free(); + o_colIdsN.free(); + + blockRowStartsT.free(); + blockRowStartsN.free(); + o_blockRowStartsN.free(); + o_blockRowStartsT.free(); + + nnzN=0; + nnzT=0; + NrowsN=0; + NrowsT=0; + Ncols=0; + NrowBlocksN=0; + NrowBlocksT=0; +} + + +template class U, + template class V, + typename T> +void extract(const dlong N, + const int K, + const memory ids, + const U q, + V gatherq) { + + const T*__restrict__ q_ptr = q.ptr(); + T*__restrict__ gatherq_ptr = gatherq.ptr(); + + if (K==1) { + for(dlong n=0;n ids, + const memory q, memory gatherq); +template void extract(const dlong N, const int K, const memory ids, + const memory q, memory gatherq); +template void extract(const dlong N, const int K, const memory ids, + const memory q, memory gatherq); +template void extract(const dlong N, const int K, const memory ids, + const memory q, memory gatherq); + +template void extract(const dlong N, const int K, const memory ids, + const pinnedMemory q, pinnedMemory gatherq); +template void extract(const dlong N, const int K, const memory 
ids, + const pinnedMemory q, pinnedMemory gatherq); +template void extract(const dlong N, const int K, const memory ids, + const pinnedMemory q, pinnedMemory gatherq); +template void extract(const dlong N, const int K, const memory ids, + const pinnedMemory q, pinnedMemory gatherq); + +} //namespace ogs + +} //namespace libp diff --git a/libs/ogs/ogsPairwise.cpp b/libs/ogs/ogsPairwise.cpp new file mode 100644 index 000000000..91098328d --- /dev/null +++ b/libs/ogs/ogsPairwise.cpp @@ -0,0 +1,430 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +#include "ogs.hpp" +#include "ogs/ogsUtils.hpp" +#include "ogs/ogsExchange.hpp" + +#ifdef GLIBCXX_PARALLEL +#include +using __gnu_parallel::sort; +#else +using std::sort; +#endif + +namespace libp { + +namespace ogs { + +/********************************** +* Host exchange +***********************************/ +template +inline void ogsPairwise_t::Start(pinnedMemory &buf, const int k, + const Op op, const Transpose trans){ + + pinnedMemory sendBuf = h_sendspace; + + const int NranksSend = (trans==NoTrans) ? NranksSendN : NranksSendT; + const int NranksRecv = (trans==NoTrans) ? NranksRecvN : NranksRecvT; + const int *sendRanks = (trans==NoTrans) ? sendRanksN.ptr() : sendRanksT.ptr(); + const int *recvRanks = (trans==NoTrans) ? recvRanksN.ptr() : recvRanksT.ptr(); + const int *sendCounts = (trans==NoTrans) ? sendCountsN.ptr() : sendCountsT.ptr(); + const int *recvCounts = (trans==NoTrans) ? recvCountsN.ptr() : recvCountsT.ptr(); + const int *sendOffsets= (trans==NoTrans) ? sendOffsetsN.ptr() : sendOffsetsT.ptr(); + const int *recvOffsets= (trans==NoTrans) ? recvOffsetsN.ptr() : recvOffsetsT.ptr(); + + //post recvs + for (int r=0;r +inline void ogsPairwise_t::Finish(pinnedMemory &buf, const int k, + const Op op, const Transpose trans){ + + const int NranksSend = (trans==NoTrans) ? NranksSendN : NranksSendT; + const int NranksRecv = (trans==NoTrans) ? NranksRecvN : NranksRecvT; + const int *recvOffsets= (trans==NoTrans) ? 
recvOffsetsN.ptr() : recvOffsetsT.ptr(); + + comm.Waitall(NranksRecv+NranksSend, requests); + + //if we recvieved anything via MPI, gather the recv buffer and scatter + // it back to to original vector + dlong Nrecv = recvOffsets[NranksRecv]; + if (Nrecv) { + // gather the recieved nodes + postmpi.Gather(buf, buf, k, op, trans); + } +} + +void ogsPairwise_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsPairwise_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsPairwise_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsPairwise_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsPairwise_t::Finish(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsPairwise_t::Finish(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsPairwise_t::Finish(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsPairwise_t::Finish(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } + +/********************************** +* GPU-aware exchange +***********************************/ +template +void ogsPairwise_t::Start(deviceMemory &o_buf, + const int k, + const Op op, + const Transpose trans){ + + const dlong Nsend = (trans == NoTrans) ? 
NsendN : NsendT; + + if (Nsend) { + deviceMemory o_sendBuf = o_sendspace; + + // assemble the send buffer on device + if (trans == NoTrans) { + extractKernel[ogsType::get()](NsendN, k, o_sendIdsN, o_buf, o_sendBuf); + } else { + extractKernel[ogsType::get()](NsendT, k, o_sendIdsT, o_buf, o_sendBuf); + } + //wait for kernel to finish on default stream + device_t &device = platform.device; + device.finish(); + } +} + +template +void ogsPairwise_t::Finish(deviceMemory &o_buf, + const int k, + const Op op, + const Transpose trans){ + + deviceMemory o_sendBuf = o_sendspace; + + const int NranksSend = (trans==NoTrans) ? NranksSendN : NranksSendT; + const int NranksRecv = (trans==NoTrans) ? NranksRecvN : NranksRecvT; + const int *sendRanks = (trans==NoTrans) ? sendRanksN.ptr() : sendRanksT.ptr(); + const int *recvRanks = (trans==NoTrans) ? recvRanksN.ptr() : recvRanksT.ptr(); + const int *sendCounts = (trans==NoTrans) ? sendCountsN.ptr() : sendCountsT.ptr(); + const int *recvCounts = (trans==NoTrans) ? recvCountsN.ptr() : recvCountsT.ptr(); + const int *sendOffsets= (trans==NoTrans) ? sendOffsetsN.ptr() : sendOffsetsT.ptr(); + const int *recvOffsets= (trans==NoTrans) ? 
recvOffsetsN.ptr() : recvOffsetsT.ptr(); + + //post recvs + for (int r=0;r &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsPairwise_t::Start(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsPairwise_t::Start(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsPairwise_t::Start(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsPairwise_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsPairwise_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsPairwise_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsPairwise_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } + +ogsPairwise_t::ogsPairwise_t(dlong Nshared, + memory &sharedNodes, + ogsOperator_t& gatherHalo, + stream_t _dataStream, + comm_t _comm, + platform_t &_platform): + ogsExchange_t(_platform,_comm,_dataStream) { + + Nhalo = gatherHalo.NrowsT; + NhaloP = gatherHalo.NrowsN; + + // sort the list by rank to the order where they will be sent by MPI_Allgatherv + sort(sharedNodes.ptr(), sharedNodes.ptr()+Nshared, + [](const parallelNode_t& a, const parallelNode_t& b) { + if(a.rank < b.rank) return true; //group by rank + if(a.rank > b.rank) return false; + + return a.newId < b.newId; //then order by the localId relative to this rank + }); + + //make mpi allgatherv counts and offsets + memory mpiSendCountsT(size,0); + memory mpiSendCountsN(size,0); + memory mpiRecvCountsT(size); + memory mpiRecvCountsN(size); + memory mpiSendOffsetsT(size+1); + memory mpiSendOffsetsN(size+1); + memory mpiRecvOffsetsT(size+1); + memory mpiRecvOffsetsN(size+1); + + for 
(dlong n=0;n0) mpiSendCountsN[r]++; + mpiSendCountsT[r]++; + } + + //shared counts + comm.Alltoall(mpiSendCountsT, mpiRecvCountsT); + comm.Alltoall(mpiSendCountsN, mpiRecvCountsN); + + //cumulative sum + mpiSendOffsetsN[0] = 0; + mpiSendOffsetsT[0] = 0; + mpiRecvOffsetsN[0] = 0; + mpiRecvOffsetsT[0] = 0; + for (int r=0;r recvNodes(Nrecv); + + //Send list of nodes to each rank + comm.Alltoallv(sharedNodes, mpiSendCountsT, mpiSendOffsetsT, + recvNodes, mpiRecvCountsT, mpiRecvOffsetsT); + + //make ops for gathering halo nodes after an MPI_Allgatherv + postmpi.platform = platform; + postmpi.kind = Signed; + + postmpi.NrowsN = Nhalo; + postmpi.NrowsT = Nhalo; + postmpi.rowStartsN.calloc(Nhalo+1); + postmpi.rowStartsT.calloc(Nhalo+1); + + //make array of counters + memory haloGatherTCounts(Nhalo); + memory haloGatherNCounts(Nhalo); + + //count the data that will already be in h_haloBuf.ptr() + for (dlong n=0;n0) ? 1 : 0; + NranksSendT += (mpiSendCountsT[r]>0) ? 1 : 0; + NranksRecvN += (mpiRecvCountsN[r]>0) ? 1 : 0; + NranksRecvT += (mpiRecvCountsT[r]>0) ? 
1 : 0; + } + + sendRanksN.calloc(NranksSendN); + sendRanksT.calloc(NranksSendT); + recvRanksN.calloc(NranksRecvN); + recvRanksT.calloc(NranksRecvT); + sendCountsN.calloc(NranksSendN); + sendCountsT.calloc(NranksSendT); + recvCountsN.calloc(NranksRecvN); + recvCountsT.calloc(NranksRecvT); + sendOffsetsN.calloc(NranksSendN+1); + sendOffsetsT.calloc(NranksSendT+1); + recvOffsetsN.calloc(NranksRecvN+1); + recvOffsetsT.calloc(NranksRecvT+1); + + //reset + NranksSendN=0; + NranksSendT=0; + NranksRecvN=0; + NranksRecvT=0; + for (int r=0;r0) { + sendRanksN[NranksSendN] = r; + sendCountsN[NranksSendN] = mpiSendCountsN[r]; + sendOffsetsN[NranksSendN] = mpiSendOffsetsN[r]; + NranksSendN++; + } + if (mpiSendCountsT[r]>0) { + sendRanksT[NranksSendT] = r; + sendCountsT[NranksSendT] = mpiSendCountsT[r]; + sendOffsetsT[NranksSendT] = mpiSendOffsetsT[r]; + NranksSendT++; + } + if (mpiRecvCountsN[r]>0) { + recvRanksN[NranksRecvN] = r; + recvCountsN[NranksRecvN] = mpiRecvCountsN[r]; + recvOffsetsN[NranksRecvN] = mpiRecvOffsetsN[r]; + NranksRecvN++; + } + if (mpiRecvCountsT[r]>0) { + recvRanksT[NranksRecvT] = r; + recvCountsT[NranksRecvT] = mpiRecvCountsT[r]; + recvOffsetsT[NranksRecvT] = mpiRecvOffsetsT[r]; + NranksRecvT++; + } + } + sendOffsetsN[NranksSendN] = mpiSendOffsetsN[size]; + sendOffsetsT[NranksSendT] = mpiSendOffsetsT[size]; + recvOffsetsN[NranksRecvN] = mpiRecvOffsetsN[size]; + recvOffsetsT[NranksRecvT] = mpiRecvOffsetsT[size]; + + requests.malloc(NranksSendT+NranksRecvT); + + //make scratch space + AllocBuffer(sizeof(dfloat)); +} + +void ogsPairwise_t::AllocBuffer(size_t Nbytes) { + if (o_workspace.size() < postmpi.nnzT*Nbytes) { + h_workspace = platform.hostMalloc(postmpi.nnzT*Nbytes); + o_workspace = platform.malloc(postmpi.nnzT*Nbytes); + } + if (o_sendspace.size() < NsendT*Nbytes) { + h_sendspace = platform.hostMalloc(NsendT*Nbytes); + o_sendspace = platform.malloc(NsendT*Nbytes); + } +} + +} //namespace ogs + +} //namespace libp diff --git a/libs/ogs/ogsSetup.cpp 
b/libs/ogs/ogsSetup.cpp index 8993cd894..0eee174dd 100644 --- a/libs/ogs/ogsSetup.cpp +++ b/libs/ogs/ogsSetup.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,520 +25,896 @@ SOFTWARE. */ #include "ogs.hpp" -#include "ogs/ogsKernels.hpp" - -typedef struct{ +#include "ogs/ogsUtils.hpp" +#include "ogs/ogsOperator.hpp" +#include "ogs/ogsExchange.hpp" +#include "timer.hpp" + +#ifdef GLIBCXX_PARALLEL +#include +using __gnu_parallel::sort; +#else +using std::sort; +#endif + +namespace libp { + +namespace ogs { + +void ogs_t::Setup(const dlong _N, + memory ids, + comm_t _comm, + const Kind _kind, + const Method method, + const bool _unique, + const bool verbose, + platform_t& _platform){ + ogsBase_t::Setup(_N, ids, _comm, _kind, method, _unique, verbose, _platform); +} - dlong localId; // local node id - hlong baseId; // original global index +void halo_t::Setup(const dlong _N, + memory ids, + comm_t _comm, + const Method method, + const bool verbose, + platform_t& _platform){ + ogsBase_t::Setup(_N, ids, _comm, Halo, method, false, verbose, _platform); - dlong newId; // new global id - int sign; + Nhalo = NhaloT - NhaloP; //number of extra recieved nodes +} -}parallelNode_t; +/******************************** + * Setup + ********************************/ +void ogsBase_t::Setup(const dlong _N, + memory ids, + comm_t _comm, + const Kind _kind, + const Method method, + const bool _unique, + const bool verbose, + platform_t& _platform){ -void setupRowBlocks(ogsData_t &A, platform_t &platform); + //release resources if this ogs was setup before + Free(); -ogs_t *ogs_t::Setup(dlong N, hlong *ids, MPI_Comm &comm, - int verbose, platform_t& platform){ + timePoint_t start = 
Time(); - ogs_t *ogs = new ogs_t(platform, comm); + platform = _platform; - //Keep track of how many gs handles we've created, and - // build kernels if this is the first - if (!ogs::Nrefs) ogs::initKernels(platform); - ogs::Nrefs++; + if (!dataStream.isInitialized()) + dataStream = platform.device.createStream(); - ogs->N = N; + N = _N; + comm = _comm; + kind = _kind; + unique = _unique; int rank, size; - MPI_Comm_rank(ogs->comm, &rank); - MPI_Comm_size(ogs->comm, &size); - - //use the host gs to find what nodes are local to this rank - int *minRank = (int *) calloc(N,sizeof(int)); - int *maxRank = (int *) calloc(N,sizeof(int)); - hlong *flagIds = (hlong *) calloc(N,sizeof(hlong)); - for (dlong i=0;i nodes(Nids); + + //fill the data (squeezing out zero ids) + Nids=0; + for (dlong n=0;n sharedNodes; + ConstructSharedNodes(Nids, nodes, Nshared, sharedNodes); - //count local and halo nodes - ogs->Nlocal=0; ogs->Nhalo=0; - for (dlong i=0;iNhalo++; - } else { - ogs->Nlocal++; + //if we altered the signs of ids, write them back + if (unique) + ids[n] = nodes[Nids].baseId; + + Nids++; } } - //set up the local gatherScatter - parallelNode_t *localNodes = (parallelNode_t*) calloc(ogs->Nlocal,sizeof(parallelNode_t)); + //setup local gather operators + if (kind==Signed) + LocalSignedSetup(Nids, nodes); + else if (kind==Unsigned) + LocalUnsignedSetup(Nids, nodes); + else + LocalHaloSetup(Nids, nodes); + + //with that, we're done with the local nodes list + nodes.free(); + + // At this point, we've setup gs operators to gather/scatter the purely local nodes, + // and gather/scatter the shared halo nodes to/from a coalesced ordering. We now + // need gs operators to scatter/gather the coalesced halo nodes to/from the expected + // orderings for MPI communications. 
+ + if (method == AllToAll) { + exchange = std::shared_ptr( + new ogsAllToAll_t(Nshared, sharedNodes, + *gatherHalo, dataStream, + comm, platform)); + } else if (method == Pairwise) { + exchange = std::shared_ptr( + new ogsPairwise_t(Nshared, sharedNodes, + *gatherHalo, dataStream, + comm, platform)); + } else if (method == CrystalRouter) { + exchange = std::shared_ptr( + new ogsCrystalRouter_t(Nshared, sharedNodes, + *gatherHalo, dataStream, + comm, platform)); + } else { //Auto + exchange = std::shared_ptr( + AutoSetup(Nshared, sharedNodes, + *gatherHalo, comm, + platform, verbose)); + } - dlong cnt=0; - for (dlong i=0;iNlocal, - [](const parallelNode_t& a, const parallelNode_t& b) { - if(abs(a.baseId) < abs(b.baseId)) return true; //group by abs(baseId) - if(abs(a.baseId) > abs(b.baseId)) return false; - - if(a.baseId > b.baseId) return true; //positive ids first - if(a.baseId < b.baseId) return false; - - return (a.localId < b.localId); //sort by local id - }); - - //flag each set of ids by whether there is at least one positive id - // and count how many local gather/scatter nodes we have - ogs->localGather.Nrows = 0; - ogs->localScatter.Nrows = 0; - if (ogs->Nlocal) { - localNodes[0].newId = 0; - int sign = (localNodes[0].baseId > 0) ? 1 : -1; - localNodes[0].sign = sign; - if (sign > 0) ogs->localGather.Nrows++; - - for (dlong i=1;iNlocal;i++) { - if (abs(localNodes[i].baseId)!=abs(localNodes[i-1].baseId)) { - sign = (localNodes[i].baseId > 0) ? 
1 : -1; - ogs->localScatter.Nrows++; - if (sign > 0) ogs->localGather.Nrows++; - } +void ogsBase_t::FindSharedNodes(const dlong Nids, + memory &nodes, + const int verbose){ - localNodes[i].newId = ogs->localScatter.Nrows; - localNodes[i].sign = sign; - } - ogs->localScatter.Nrows++; + int rank, size; + rank = comm.rank(); + size = comm.size(); + + memory sendCounts(size,0); + memory recvCounts(size); + memory sendOffsets(size+1); + memory recvOffsets(size+1); + + //count number of ids we're sending + for (dlong n=0;nNlocal, - [](const parallelNode_t& a, const parallelNode_t& b) { - return (a.localId < b.localId); //sort by local id - }); + comm.Alltoall(sendCounts, recvCounts); - //tally up how many nodes are being gathered to each gatherNode and - // map to a local ordering - dlong *localGatherCounts = (dlong*) calloc(ogs->localScatter.Nrows,sizeof(dlong)); - dlong *localScatterCounts = (dlong*) calloc(ogs->localScatter.Nrows,sizeof(dlong)); + sendOffsets[0] = 0; + recvOffsets[0] = 0; + for (int r=0;rlocalScatter.Nrows,sizeof(dlong)); + //reset counter + sendCounts[r] = 0; + } - for (dlong i=0;ilocalScatter.Nrows;i++) localMap[i] = -1; //initialize map + //write a send ordering into newIds + for (dlong n=0;nlocalGather.Nrows; - for (dlong i=0;iNlocal;i++) { - dlong newId = localNodes[i].newId; //get the ordered id + // permute the list to send ordering + permute(Nids, nodes, [](const parallelNode_t& a) { return a.newId; } ); - //record a new index if this is a new gatherNode (pure negative nodes appended at the end) - if (localMap[newId]==-1) { - if (localNodes[i].sign > 0) - localMap[newId] = cnt++; - else - localMap[newId] = cnt2++; - } + dlong recvN = recvOffsets[size]; //total ids to recv - dlong gid = localMap[newId]; - localNodes[i].newId = gid; //reorder - localScatterCounts[gid]++; //tally - if (localNodes[i].baseId > 0) - localGatherCounts[gid]++; //tally - } - free(localMap); + memory recvNodes(recvN); - ogs->localGather.rowStarts = (dlong*) 
calloc(ogs->localScatter.Nrows+1,sizeof(dlong)); - ogs->localScatter.rowStarts = (dlong*) calloc(ogs->localScatter.Nrows+1,sizeof(dlong)); - for (dlong i=0;ilocalScatter.Nrows;i++) { - ogs->localGather.rowStarts[i+1] = ogs->localGather.rowStarts[i] + localGatherCounts[i]; - ogs->localScatter.rowStarts[i+1] = ogs->localScatter.rowStarts[i] + localScatterCounts[i]; + //Send all the nodes to their destination rank. + comm.Alltoallv( nodes, sendCounts, sendOffsets, + recvNodes, recvCounts, recvOffsets); - //reset counters - localScatterCounts[i] = 0; - localGatherCounts[i] = 0; + //remember this ordering + for (dlong n=0;nlocalGather.nnz = ogs->localGather.rowStarts[ogs->localGather.Nrows]; - ogs->localScatter.nnz = ogs->localScatter.rowStarts[ogs->localScatter.Nrows]; + // sort based on base ids + sort(recvNodes.ptr(), recvNodes.ptr()+recvN, + [](const parallelNode_t& a, const parallelNode_t& b) { + return abs(a.baseId) < abs(b.baseId); + }); + + // We now have a collection of nodes associated with some subset of all global Ids + // Our list is sorted by baseId to group nodes with the same globalId together + // We now want to flag which nodes are shared via MPI + + int is_unique=1; + + dlong Nshared=0; + + dlong start=0; + for (dlong n=0;nlocalGather.colIds = (dlong*) calloc(ogs->localGather.nnz+1,sizeof(dlong)); //extra entry so the occa buffer will actually exist - ogs->localScatter.colIds = (dlong*) calloc(ogs->localScatter.nnz+1,sizeof(dlong)); //extra entry so the occa buffer will actually exist - for (dlong i=0;iNlocal;i++) { - dlong gid = localNodes[i].newId; + //pick a random node in this group + const int m = (rand() % (end-start)); - dlong soffset = ogs->localScatter.rowStarts[gid]; - int sindex = localScatterCounts[gid]; - ogs->localScatter.colIds[soffset+sindex] = localNodes[i].localId; - localScatterCounts[gid]++; + for (int i=start;i 0) { - dlong goffset = ogs->localGather.rowStarts[gid]; - int gindex = localGatherCounts[gid]; - 
ogs->localGather.colIds[goffset+gindex] = localNodes[i].localId; - localGatherCounts[gid]++; + recvNodes[start+m].baseId = baseId; + positiveCount=1; + } else { + //count how many postive baseIds there are in this group + for (int i=start;i0) positiveCount++; + + //if we didnt find a sole positive baseId, the gather is not well-defined + if (positiveCount!=1) is_unique=0; + } + + // When making a halo excahnge, check that we have a leading positive id + LIBP_ABORT("Found " << positiveCount << " positive Ids for baseId: " + << abs(recvNodes[start].baseId)<< ".", + kind==Halo && positiveCount!=1); + + //determine if this node is shared via MPI, + int shared=1; + const int r = recvNodes[start].rank; + for (int i=start+1;ilocalGather.o_rowStarts = platform.malloc((ogs->localScatter.Nrows+1)*sizeof(dlong), ogs->localGather.rowStarts); - ogs->localScatter.o_rowStarts = platform.malloc((ogs->localScatter.Nrows+1)*sizeof(dlong), ogs->localScatter.rowStarts); + //shared the unique node check so we know if the gather operation is well-defined + comm.Allreduce(is_unique, Comm::Min); + gather_defined = (is_unique==1); - ogs->localGather.o_colIds = platform.malloc((ogs->localGather.nnz+1)*sizeof(dlong), ogs->localGather.colIds); - ogs->localScatter.o_colIds = platform.malloc((ogs->localScatter.nnz+1)*sizeof(dlong), ogs->localScatter.colIds); + hlong Nshared_global = Nshared; + comm.Reduce(Nshared_global, 0); + if (!rank && verbose) { + std::cout << "ogs Setup: " << Nshared_global << " unique labels shared." 
<< std::endl; + } - //divide the list of colIds into roughly equal sized blocks so that each - // threadblock loads approxiamtely an equal amount of data - setupRowBlocks(ogs->localGather, platform); - setupRowBlocks(ogs->localScatter, platform); - - free(localNodes); - - //make some compressed versions of the gather/scatter ids for the fused gs kernel - ogs->fusedGather.Nrows=0; - ogs->fusedScatter.Nrows=0; - ogs->symGatherScatter.Nrows=0; - - ogs->fusedGather.nnz=0; - ogs->fusedScatter.nnz=0; - ogs->symGatherScatter.nnz=0; - - for (dlong n=0;nlocalScatter.Nrows;n++) { - int gatherCnt = ogs->localGather.rowStarts[n+1] -ogs->localGather.rowStarts[n]; - int scatterCnt = ogs->localScatter.rowStarts[n+1]-ogs->localScatter.rowStarts[n]; - - //only include this node if either the gather or scatter interact with mulitple nodes - // otherwise the op is identity and ignored - if ((gatherCnt>1)||(scatterCnt>1)) { - ogs->fusedGather.Nrows++; - ogs->fusedScatter.Nrows++; - ogs->fusedGather.nnz += gatherCnt; - ogs->fusedScatter.nnz += scatterCnt; - } + //at this point each collection of baseIds either has all nodes have + // sign = 1, meaning all the nodes with this baseId are on the + // same rank, or have sign=2, meaning that baseId must be communicated + + // permute recv nodes back to recv'd ordering + permute(recvN, recvNodes, [](const parallelNode_t& a) { return a.newId; } ); + + //Return all the nodes to their origin rank. 
+ comm.Alltoallv(recvNodes, recvCounts, recvOffsets, + nodes, sendCounts, sendOffsets); +} + +void ogsBase_t::ConstructSharedNodes(const dlong Nids, + memory &nodes, + dlong &Nshared, + memory &sharedNodes) { + + int size = comm.size(); + + // sort based on abs(baseId) + sort(nodes.ptr(), nodes.ptr()+Nids, + [](const parallelNode_t& a, const parallelNode_t& b) { + if(abs(a.baseId) < abs(b.baseId)) return true; //group by abs(baseId) + if(abs(a.baseId) > abs(b.baseId)) return false; + + return a.baseId > b.baseId; //positive ids on a rank first + }); + + //count how many unique global Ids we have on this rank + // and flag baseId groups that have a positive baseId somewhere on this rank + dlong NbaseIds=0; + NlocalT=0; NlocalP=0; + NhaloT=0; NhaloP=0; + dlong start=0; + for (dlong n=0;n1) { - ogs->symGatherScatter.Nrows++; - ogs->symGatherScatter.nnz += scatterCnt; + NbaseIds++; + start = end; } } - ogs->fusedGather.rowStarts = (dlong*) calloc(ogs->fusedScatter.Nrows+1,sizeof(dlong)); - ogs->fusedScatter.rowStarts = (dlong*) calloc(ogs->fusedScatter.Nrows+1,sizeof(dlong)); - ogs->symGatherScatter.rowStarts = (dlong*) calloc(ogs->symGatherScatter.Nrows+1,sizeof(dlong)); - - ogs->fusedGather.colIds = (dlong*) calloc(ogs->fusedGather.nnz+1,sizeof(dlong)); - ogs->fusedScatter.colIds = (dlong*) calloc(ogs->fusedScatter.nnz+1,sizeof(dlong)); - ogs->symGatherScatter.colIds = (dlong*) calloc(ogs->symGatherScatter.nnz+1,sizeof(dlong)); - - //reset counters - ogs->fusedGather.Nrows=0; - ogs->fusedScatter.Nrows=0; - ogs->symGatherScatter.Nrows=0; - - ogs->fusedGather.nnz=0; - ogs->fusedScatter.nnz=0; - ogs->symGatherScatter.nnz=0; - for (dlong n=0;nlocalScatter.Nrows;n++) { - int gatherCnt = ogs->localGather.rowStarts[n+1] -ogs->localGather.rowStarts[n]; - int scatterCnt = ogs->localScatter.rowStarts[n+1]-ogs->localScatter.rowStarts[n]; - - //only include this node if either the gather and scatter interact with mulitple nodes - // otherwise the op is identity and ignored - if 
((gatherCnt>1)||(scatterCnt>1)) { - ogs->fusedGather.Nrows++; - ogs->fusedScatter.Nrows++; - ogs->fusedGather.rowStarts[ogs->fusedGather.Nrows] = gatherCnt + ogs->fusedGather.rowStarts[ogs->fusedGather.Nrows-1]; - ogs->fusedScatter.rowStarts[ogs->fusedScatter.Nrows] = scatterCnt + ogs->fusedScatter.rowStarts[ogs->fusedScatter.Nrows-1]; - - for (int i=ogs->localGather.rowStarts[n];ilocalGather.rowStarts[n+1];i++) - ogs->fusedGather.colIds[ogs->fusedGather.nnz++] = ogs->localGather.colIds[i]; - - for (int i=ogs->localScatter.rowStarts[n];ilocalScatter.rowStarts[n+1];i++) - ogs->fusedScatter.colIds[ogs->fusedScatter.nnz++] = ogs->localScatter.colIds[i]; - } + //total number of positive owned gathered nodes + Ngather = NlocalP+NhaloP; - //for the sym op only the scatter ids are used - if (scatterCnt>1) { - ogs->symGatherScatter.Nrows++; - ogs->symGatherScatter.rowStarts[ogs->symGatherScatter.Nrows] = scatterCnt + ogs->symGatherScatter.rowStarts[ogs->symGatherScatter.Nrows-1]; + //global total + NgatherGlobal = Ngather; + comm.Allreduce(NgatherGlobal); - for (int i=ogs->localScatter.rowStarts[n];ilocalScatter.rowStarts[n+1];i++) - ogs->symGatherScatter.colIds[ogs->symGatherScatter.nnz++] = ogs->localScatter.colIds[i]; + //extract the leading node from each shared baseId + memory sendSharedNodes(NhaloT); + + NhaloT=0; + for (dlong n=0;nfusedGather.o_rowStarts = platform.malloc((ogs->fusedScatter.Nrows+1)*sizeof(dlong), ogs->fusedGather.rowStarts); - ogs->fusedScatter.o_rowStarts = platform.malloc((ogs->fusedScatter.Nrows+1)*sizeof(dlong), ogs->fusedScatter.rowStarts); - ogs->symGatherScatter.o_rowStarts = platform.malloc((ogs->symGatherScatter.Nrows+1)*sizeof(dlong), ogs->symGatherScatter.rowStarts); + // permute the list back to local id ordering + permute(Nids, nodes, [](const parallelNode_t& a) { return a.localId; } ); + + // Use the newId index to reorder the baseId groups based on + // the order we encouter them in their original ordering. 
+ memory indexMap(NbaseIds, -1); + + dlong localCntN = 0, localCntT = NlocalP; //start point for local gather nodes + dlong haloCntN = 0, haloCntT = NhaloP; //start point for halo gather nodes + for (dlong n=0;nfusedGather.o_colIds = platform.malloc((ogs->fusedGather.nnz+1)*sizeof(dlong), ogs->fusedGather.colIds); - ogs->fusedScatter.o_colIds = platform.malloc((ogs->fusedScatter.nnz+1)*sizeof(dlong), ogs->fusedScatter.colIds); - ogs->symGatherScatter.o_colIds = platform.malloc((ogs->symGatherScatter.nnz+1)*sizeof(dlong), ogs->symGatherScatter.colIds); + const dlong gid = indexMap[newId]; + nodes[n].newId = gid; //reorder + } - setupRowBlocks(ogs->fusedGather, platform); - setupRowBlocks(ogs->fusedScatter, platform); - setupRowBlocks(ogs->symGatherScatter, platform); + //re-order the shared node list + for (dlong n=0;nfusedGather.blockRowStarts) free(ogs->fusedGather.blockRowStarts); - ogs->fusedGather.o_blockRowStarts.free(); - ogs->fusedGather.NrowBlocks = ogs->fusedScatter.NrowBlocks; - ogs->fusedGather.blockRowStarts = ogs->fusedScatter.blockRowStarts; - ogs->fusedGather.o_blockRowStarts = ogs->fusedScatter.o_blockRowStarts; + indexMap.free(); - //set up the halo gatherScatter - parallelNode_t *haloNodes = (parallelNode_t*) calloc(ogs->Nhalo+1,sizeof(parallelNode_t)); + memory sendCounts(size,0); + memory recvCounts(size); + memory sendOffsets(size+1); + memory recvOffsets(size+1); - cnt=0; - for (dlong i=0;iNhalo, - [](const parallelNode_t& a, const parallelNode_t& b) { - if(abs(a.baseId) < abs(b.baseId)) return true; //group by abs(baseId) - if(abs(a.baseId) > abs(b.baseId)) return false; + comm.Alltoall(sendCounts, recvCounts); + + sendOffsets[0] = 0; + recvOffsets[0] = 0; + for (int r=0;r recvSharedNodes(recvN); + + //Send all the nodes to their destination rank. 
+ comm.Alltoallv(sendSharedNodes, sendCounts, sendOffsets, + recvSharedNodes, recvCounts, recvOffsets); + + //free up some space + sendSharedNodes.free(); + sendCounts.free(); + recvCounts.free(); + sendOffsets.free(); + recvOffsets.free(); + + // sort based on base ids + sort(recvSharedNodes.ptr(), recvSharedNodes.ptr()+recvN, + [](const parallelNode_t& a, const parallelNode_t& b) { + return abs(a.baseId) < abs(b.baseId); + }); + + //count number of shared nodes we will be sending + memory sharedSendCounts(size,0); + memory sharedRecvCounts(size); + memory sharedSendOffsets(size+1); + memory sharedRecvOffsets(size+1); + + start=0; + for (dlong n=0;n b.baseId) return true; //positive ids first - if(a.baseId < b.baseId) return false; + //set new baseId group start point + start=n+1; + } + } - return (a.localId < b.localId); //sort by local id - }); + // Each rank has a set of shared global Ids and for each global id that + // rank knows what MPI ranks participate in gathering. We now send this + // information to the involved ranks. - ogs->haloGather.Nrows = 0; - ogs->haloScatter.Nrows = 0; + //share counts + comm.Alltoall(sharedSendCounts, sharedRecvCounts); - if (ogs->Nhalo) { - haloNodes[0].newId = 0; - int sign = (haloNodes[0].baseId > 0) ? 1 : -1; - haloNodes[0].sign = sign; - if (sign > 0) ogs->haloGather.Nrows++; + //cumulative sum + sharedSendOffsets[0] = 0; + sharedRecvOffsets[0] = 0; + for (int r=0;rNhalo;i++) { - if (abs(haloNodes[i].baseId)!=abs(haloNodes[i-1].baseId)) { - sign = (haloNodes[i].baseId > 0) ? 
1 : -1; - ogs->haloScatter.Nrows++; - if (sign > 0) ogs->haloGather.Nrows++; + //make a send buffer + memory sharedSendNodes(sharedSendOffsets[size]); + + //reset sendCounts + for (int r=0;rhaloScatter.Nrows; - haloNodes[i].sign = sign; + //set new baseId group start point + start=n+1; } - ogs->haloScatter.Nrows++; } + recvSharedNodes.free(); + + //make sharedNodes to hold the exchange data we recv + Nshared = sharedRecvOffsets[size]; + sharedNodes = memory(Nshared); + + //Share all the gathering info + comm.Alltoallv(sharedSendNodes, sharedSendCounts, sharedSendOffsets, + sharedNodes, sharedRecvCounts, sharedRecvOffsets); +} + +//Make local and halo gather operators using nodes list +void ogsBase_t::LocalSignedSetup(const dlong Nids, memory &nodes){ + + gatherLocal = std::make_shared(platform); + gatherHalo = std::make_shared(platform); + + gatherLocal->kind = Signed; + gatherHalo->kind = Signed; + + gatherLocal->Ncols = N; + gatherHalo->Ncols = N; - // sort based on local ids - std::sort(haloNodes, haloNodes+ogs->Nhalo, - [](const parallelNode_t& a, const parallelNode_t& b) { - return (a.localId < b.localId); //sort by local id - }); + gatherLocal->NrowsN = NlocalP; + gatherLocal->NrowsT = NlocalT; + gatherHalo->NrowsN = NhaloP; + gatherHalo->NrowsT = NhaloT; //tally up how many nodes are being gathered to each gatherNode and // map to a local ordering - dlong *haloGatherCounts = (dlong*) calloc(ogs->haloGather.Nrows+1,sizeof(dlong)); - dlong *haloScatterCounts = (dlong*) calloc(ogs->haloScatter.Nrows+1,sizeof(dlong)); - dlong *haloMap = (dlong*) calloc(ogs->haloScatter.Nrows+1,sizeof(dlong)); - hlong *haloIds = (hlong *) calloc(ogs->haloScatter.Nrows+1,sizeof(hlong)); - hlong *haloIdsSym = (hlong *) calloc(ogs->haloScatter.Nrows+1,sizeof(hlong)); - - for (dlong i=0;ihaloScatter.Nrows;i++) haloMap[i] = -1; //initialize map - - cnt = 0; - cnt2 = ogs->haloGather.Nrows; - for (dlong i=0;iNhalo;i++) { - dlong newId = haloNodes[i].newId; //get the ordered id - - if 
(haloMap[newId] == -1) { - if (haloNodes[i].sign > 0) - haloMap[newId] = cnt++; - else - haloMap[newId] = cnt2++; - - //record the base id of the gathered node - haloIds[haloMap[newId]] = haloNodes[i].sign*abs(haloNodes[i].baseId); - haloIdsSym[haloMap[newId]] = abs(haloNodes[i].baseId); + memory localGatherNCounts(gatherLocal->NrowsT,0); + memory localGatherTCounts(gatherLocal->NrowsT,0); + memory haloGatherNCounts(gatherHalo->NrowsT,0); + memory haloGatherTCounts(gatherHalo->NrowsT,0); + + for (dlong i=0;i0) localGatherNCounts[gid]++; //tally + localGatherTCounts[gid]++; //tally + } else { //halo + if (nodes[i].baseId>0) haloGatherNCounts[gid]++; //tally + haloGatherTCounts[gid]++; //tally } - - dlong gid = haloMap[newId]; - haloNodes[i].newId = gid; //reorder - haloScatterCounts[gid]++; //tally - if (haloNodes[i].baseId>0) - haloGatherCounts[gid]++; //tally } - free(haloMap); - ogs->haloGather.rowStarts = (dlong*) calloc(ogs->haloGather.Nrows+1,sizeof(dlong)); - ogs->haloScatter.rowStarts = (dlong*) calloc(ogs->haloScatter.Nrows+1,sizeof(dlong)); - for (dlong i=0;ihaloGather.Nrows;i++) { - ogs->haloGather.rowStarts[i+1] = ogs->haloGather.rowStarts[i] + haloGatherCounts[i]; - haloGatherCounts[i] = 0; + //make local row offsets + gatherLocal->rowStartsN.malloc(gatherLocal->NrowsT+1); + gatherLocal->rowStartsT.malloc(gatherLocal->NrowsT+1); + gatherLocal->rowStartsN[0] = 0; + gatherLocal->rowStartsT[0] = 0; + for (dlong i=0;iNrowsT;i++) { + gatherLocal->rowStartsN[i+1] = gatherLocal->rowStartsN[i] + localGatherNCounts[i]; + gatherLocal->rowStartsT[i+1] = gatherLocal->rowStartsT[i] + localGatherTCounts[i]; + localGatherNCounts[i] = 0; //reset counters + localGatherTCounts[i] = 0; //reset counters } - for (dlong i=0;ihaloScatter.Nrows;i++) { - ogs->haloScatter.rowStarts[i+1] = ogs->haloScatter.rowStarts[i] + haloScatterCounts[i]; - haloScatterCounts[i] = 0; + gatherLocal->nnzN = gatherLocal->rowStartsN[gatherLocal->NrowsT]; + gatherLocal->nnzT = 
gatherLocal->rowStartsT[gatherLocal->NrowsT]; + gatherLocal->colIdsN.malloc(gatherLocal->nnzN); + gatherLocal->colIdsT.malloc(gatherLocal->nnzT); + + //make halo row offsets + gatherHalo->rowStartsN.malloc(gatherHalo->NrowsT+1); + gatherHalo->rowStartsT.malloc(gatherHalo->NrowsT+1); + gatherHalo->rowStartsN[0] = 0; + gatherHalo->rowStartsT[0] = 0; + for (dlong i=0;iNrowsT;i++) { + gatherHalo->rowStartsN[i+1] = gatherHalo->rowStartsN[i] + haloGatherNCounts[i]; + gatherHalo->rowStartsT[i+1] = gatherHalo->rowStartsT[i] + haloGatherTCounts[i]; + haloGatherNCounts[i] = 0; + haloGatherTCounts[i] = 0; } + gatherHalo->nnzN = gatherHalo->rowStartsN[gatherHalo->NrowsT]; + gatherHalo->nnzT = gatherHalo->rowStartsT[gatherHalo->NrowsT]; + gatherHalo->colIdsN.malloc(gatherHalo->nnzN); + gatherHalo->colIdsT.malloc(gatherHalo->nnzT); - ogs->haloGather.nnz = ogs->haloGather.rowStarts[ogs->haloGather.Nrows]; - ogs->haloScatter.nnz = ogs->haloScatter.rowStarts[ogs->haloScatter.Nrows]; - - ogs->haloGather.colIds = (dlong*) calloc(ogs->haloGather.nnz+1,sizeof(dlong)); - ogs->haloScatter.colIds = (dlong*) calloc(ogs->haloScatter.nnz+1,sizeof(dlong)); - for (dlong i=0;iNhalo;i++) { - dlong gid = haloNodes[i].newId; - dlong soffset = ogs->haloScatter.rowStarts[gid]; - int sindex = haloScatterCounts[gid]; - ogs->haloScatter.colIds[soffset+sindex] = haloNodes[i].localId; - haloScatterCounts[gid]++; + for (dlong i=0;i 0) { - dlong goffset = ogs->haloGather.rowStarts[gid]; - int gindex = haloGatherCounts[gid]; - ogs->haloGather.colIds[goffset+gindex] = haloNodes[i].localId; - haloGatherCounts[gid]++; + if (abs(nodes[i].sign)==1) { //local gather group + if (nodes[i].baseId>0) { + const dlong soffset = gatherLocal->rowStartsN[gid]; + const int sindex = localGatherNCounts[gid]; + gatherLocal->colIdsN[soffset+sindex] = nodes[i].localId; + localGatherNCounts[gid]++; + } + const dlong soffset = gatherLocal->rowStartsT[gid]; + const int sindex = localGatherTCounts[gid]; + 
gatherLocal->colIdsT[soffset+sindex] = nodes[i].localId; + localGatherTCounts[gid]++; + } else { + if (nodes[i].baseId>0) { + const dlong soffset = gatherHalo->rowStartsN[gid]; + const int sindex = haloGatherNCounts[gid]; + gatherHalo->colIdsN[soffset+sindex] = nodes[i].localId; + haloGatherNCounts[gid]++; + } + const dlong soffset = gatherHalo->rowStartsT[gid]; + const int sindex = haloGatherTCounts[gid]; + gatherHalo->colIdsT[soffset+sindex] = nodes[i].localId; + haloGatherTCounts[gid]++; } } - free(haloGatherCounts); - free(haloScatterCounts); - - ogs->haloGather.o_rowStarts = platform.malloc((ogs->haloGather.Nrows+1)*sizeof(dlong), ogs->haloGather.rowStarts); - ogs->haloScatter.o_rowStarts = platform.malloc((ogs->haloScatter.Nrows+1)*sizeof(dlong), ogs->haloScatter.rowStarts); + localGatherNCounts.free(); + localGatherTCounts.free(); + haloGatherNCounts.free(); + haloGatherTCounts.free(); - ogs->haloGather.o_colIds = platform.malloc((ogs->haloGather.nnz+1)*sizeof(dlong), ogs->haloGather.colIds); - ogs->haloScatter.o_colIds = platform.malloc((ogs->haloScatter.nnz+1)*sizeof(dlong), ogs->haloScatter.colIds); + gatherLocal->o_rowStartsN = platform.malloc(gatherLocal->rowStartsN); + gatherLocal->o_rowStartsT = platform.malloc(gatherLocal->rowStartsT); + gatherLocal->o_colIdsN = platform.malloc(gatherLocal->colIdsN); + gatherLocal->o_colIdsT = platform.malloc(gatherLocal->colIdsT); - setupRowBlocks(ogs->haloGather, platform); - setupRowBlocks(ogs->haloScatter, platform); + gatherHalo->o_rowStartsN = platform.malloc(gatherHalo->rowStartsN); + gatherHalo->o_rowStartsT = platform.malloc(gatherHalo->rowStartsT); + gatherHalo->o_colIdsN = platform.malloc(gatherHalo->colIdsN); + gatherHalo->o_colIdsT = platform.malloc(gatherHalo->colIdsT); - free(haloNodes); + //divide the list of colIds into roughly equal sized blocks so that each + // threadblock loads approximately an equal amount of data + gatherLocal->setupRowBlocks(); + gatherHalo->setupRowBlocks(); +} - //make a 
host gs handle - ogs->Nlocal = ogs->localScatter.Nrows; - ogs->Nhalo = ogs->haloScatter.Nrows; - ogs->gsh = ogs::gsSetup(comm, ogs->Nhalo, haloIds, 0,0); - ogs->gshSym = ogs::gsSetup(comm, ogs->Nhalo, haloIdsSym, 0,0); +//Make local and halo gather operators using nodes list +void ogsBase_t::LocalUnsignedSetup(const dlong Nids, memory &nodes){ - free(haloIds); - free(haloIdsSym); + gatherLocal = std::make_shared(platform); + gatherHalo = std::make_shared(platform); - free(minRank); free(maxRank); + gatherLocal->kind = Unsigned; + gatherHalo->kind = Unsigned; - //total number of owned gathered nodes - ogs->Ngather = ogs->localGather.Nrows+ogs->haloGather.Nrows; + gatherLocal->Ncols = N; + gatherHalo->Ncols = N; - //total size of halo for gathered array - ogs->NgatherHalo = ogs->haloScatter.Nrows-ogs->haloGather.Nrows; + gatherLocal->NrowsN = NlocalP; + gatherLocal->NrowsT = NlocalT; + gatherHalo->NrowsN = NhaloP; + gatherHalo->NrowsT = NhaloT; - hlong NgatherLocal = (hlong) ogs->Ngather; - MPI_Allreduce(&NgatherLocal, &(ogs->NgatherGlobal), 1, MPI_HLONG, MPI_SUM, comm); + //tally up how many nodes are being gathered to each gatherNode and + // map to a local ordering + memory localGatherTCounts(gatherLocal->NrowsT,0); + memory haloGatherTCounts(gatherHalo->NrowsT,0); - ogs->hostBuf = nullptr; - ogs->haloBuf = nullptr; - ogs->hostBufSize = 0; + for (dlong i=0;irowStartsT.malloc(gatherLocal->NrowsT+1); + gatherLocal->rowStartsN = gatherLocal->rowStartsT; + gatherLocal->rowStartsT[0] = 0; + for (dlong i=0;iNrowsT;i++) { + gatherLocal->rowStartsT[i+1] = gatherLocal->rowStartsT[i] + localGatherTCounts[i]; + localGatherTCounts[i] = 0; //reset counters + } + gatherLocal->nnzT = gatherLocal->rowStartsT[gatherLocal->NrowsT]; + gatherLocal->nnzN = gatherLocal->nnzT; + gatherLocal->colIdsT.malloc(gatherLocal->nnzT); + gatherLocal->colIdsN = gatherLocal->colIdsT; + + //make halo row offsets + gatherHalo->rowStartsT.malloc(gatherHalo->NrowsT+1); + gatherHalo->rowStartsN = 
gatherHalo->rowStartsT; + gatherHalo->rowStartsT[0] = 0; + for (dlong i=0;iNrowsT;i++) { + gatherHalo->rowStartsT[i+1] = gatherHalo->rowStartsT[i] + haloGatherTCounts[i]; + haloGatherTCounts[i] = 0; + } + gatherHalo->nnzT = gatherHalo->rowStartsT[gatherHalo->NrowsT]; + gatherHalo->nnzN = gatherHalo->nnzT; + gatherHalo->colIdsT.malloc(gatherHalo->nnzT); + gatherHalo->colIdsN = gatherHalo->colIdsT; - ogs::gsFree(gsh); - ogs::Nrefs--; - if (!ogs::Nrefs) ogs::freeKernels(); -} + for (dlong i=0;irowStartsT[gid]; + const int sindex = localGatherTCounts[gid]; + gatherLocal->colIdsT[soffset+sindex] = nodes[i].localId; + localGatherTCounts[gid]++; + } else { + const dlong soffset = gatherHalo->rowStartsT[gid]; + const int sindex = haloGatherTCounts[gid]; + gatherHalo->colIdsT[soffset+sindex] = nodes[i].localId; + haloGatherTCounts[gid]++; } } -} + localGatherTCounts.free(); + haloGatherTCounts.free(); -void ogs_t::reallocOccaBuffer(size_t Nbytes) { - if (Nhalo) { - if (o_haloBuf.size() < Nhalo*Nbytes) { - if (o_haloBuf.size()) o_haloBuf.free(); - haloBuf = platform.hostMalloc(Nhalo*Nbytes, nullptr, h_haloBuf); - o_haloBuf = platform.malloc(Nhalo*Nbytes); - } - } + gatherLocal->o_rowStartsT = platform.malloc(gatherLocal->rowStartsT); + gatherLocal->o_rowStartsN = gatherLocal->o_rowStartsT; + gatherLocal->o_colIdsT = platform.malloc(gatherLocal->colIdsT); + gatherLocal->o_colIdsN = gatherLocal->o_colIdsT; + + gatherHalo->o_rowStartsT = platform.malloc(gatherHalo->rowStartsT); + gatherHalo->o_rowStartsN = gatherHalo->o_rowStartsT; + gatherHalo->o_colIdsT = platform.malloc(gatherHalo->colIdsT); + gatherHalo->o_colIdsN = gatherHalo->o_colIdsT; + + //divide the list of colIds into roughly equal sized blocks so that each + // threadblock loads approximately an equal amount of data + gatherLocal->setupRowBlocks(); + gatherHalo->setupRowBlocks(); } -void setupRowBlocks(ogsData_t &A, platform_t &platform) { +//Make local and halo gather operators using nodes list +void 
ogsBase_t::LocalHaloSetup(const dlong Nids, memory &nodes){ - dlong blockSum=0; - A.NrowBlocks=0; - if (A.Nrows) A.NrowBlocks++; - for (dlong i=0;i(platform); + gatherHalo->kind = Signed; - if (rowSize > ogs::gatherNodesPerBlock) { - //this row is pathalogically big. We can't currently run this - stringstream ss; - ss << "Multiplicity of global node id: " << i << "in ogsSetup is too large."; - LIBP_ABORT(ss.str()) - } + gatherHalo->Ncols = N; - if (blockSum+rowSize > ogs::gatherNodesPerBlock) { //adding this row will exceed the nnz per block - A.NrowBlocks++; //count the previous block - blockSum=rowSize; //start a new row block - } else { - blockSum+=rowSize; //add this row to the block + gatherHalo->NrowsN = NhaloP; + gatherHalo->NrowsT = NhaloT; + + //tally up how many nodes are being gathered to each gatherNode and + // map to a local ordering + memory haloGatherNCounts(gatherHalo->NrowsT,0); + memory haloGatherTCounts(gatherHalo->NrowsT,0); + + for (dlong i=0;irowStartsN.malloc(gatherHalo->NrowsT+1); + gatherHalo->rowStartsT.malloc(gatherHalo->NrowsT+1); + gatherHalo->rowStartsN[0]=0; + gatherHalo->rowStartsT[0]=0; + for (dlong i=0;iNrowsT;i++) { + gatherHalo->rowStartsN[i+1] = gatherHalo->rowStartsN[i] + haloGatherNCounts[i]; + gatherHalo->rowStartsT[i+1] = gatherHalo->rowStartsT[i] + haloGatherTCounts[i]; + haloGatherNCounts[i] = 0; + haloGatherTCounts[i] = 0; + } + gatherHalo->nnzN = gatherHalo->rowStartsN[gatherHalo->NrowsT]; + gatherHalo->nnzT = gatherHalo->rowStartsT[gatherHalo->NrowsT]; + gatherHalo->colIdsN.malloc(gatherHalo->nnzN); + gatherHalo->colIdsT.malloc(gatherHalo->nnzT); - blockSum=0; - A.NrowBlocks=0; - if (A.Nrows) A.NrowBlocks++; - for (dlong i=0;i ogs::gatherNodesPerBlock) { //adding this row will exceed the nnz per block - A.blockRowStarts[A.NrowBlocks++] = i; //mark the previous block - blockSum=rowSize; //start a new row block - } else { - blockSum+=rowSize; //add this row to the block + for (dlong i=0;irowStartsN[gid]; + const int 
sindex = haloGatherNCounts[gid]; + gatherHalo->colIdsN[soffset+sindex] = nodes[i].localId; + haloGatherNCounts[gid]++; + } + const dlong soffset = gatherHalo->rowStartsT[gid]; + const int sindex = haloGatherTCounts[gid]; + gatherHalo->colIdsT[soffset+sindex] = nodes[i].localId; + haloGatherTCounts[gid]++; } } - A.blockRowStarts[A.NrowBlocks] = A.Nrows; + haloGatherNCounts.free(); + haloGatherTCounts.free(); + + gatherHalo->o_rowStartsN = platform.malloc(gatherHalo->rowStartsN); + gatherHalo->o_rowStartsT = platform.malloc(gatherHalo->rowStartsT); + gatherHalo->o_colIdsN = platform.malloc(gatherHalo->colIdsN); + gatherHalo->o_colIdsT = platform.malloc(gatherHalo->colIdsT); + + //divide the list of colIds into roughly equal sized blocks so that each + // threadblock loads approximately an equal amount of data + gatherHalo->setupRowBlocks(); +} + +void ogsBase_t::Free() { + comm.Free(); + gatherLocal = nullptr; + gatherHalo = nullptr; + exchange = nullptr; + N=0; + NlocalT=0; + NhaloT=0; + Ngather=0; + NgatherGlobal=0; +} + +void ogsBase_t::AssertGatherDefined() { + LIBP_ABORT("Gather operation not well-defined.", + !gather_defined); +} + +//Populate the local mapping of the original ids and the gathered ordering +void ogs_t::SetupGlobalToLocalMapping(memory GlobalToLocal) { + + LIBP_ABORT("ogs handle is not set up.", + NgatherGlobal==0); + + //Note: Must have GlobalToLocal have N entries. 
+ + memory ids(NlocalT+NhaloT); + + for (dlong n=0;nScatter(GlobalToLocal, ids, + 1, NoTrans); + gatherHalo->Scatter(GlobalToLocal, ids+NlocalT, + 1, NoTrans); +} + +void halo_t::SetupFromGather(ogs_t& ogs) { + + ogs.AssertGatherDefined(); + + platform = ogs.platform; + comm = ogs.comm; + + N = ogs.NlocalT + ogs.NhaloT; + + Ngather = Ngather; + Nhalo = ogs.NhaloT - ogs.NhaloP; + + NgatherGlobal = ogs.NgatherGlobal; + + kind = Halo; + unique = ogs.unique; + + NlocalP = ogs.NlocalP; + NlocalT = ogs.NlocalT; + + NhaloP = ogs.NhaloP; + NhaloT = ogs.NhaloT; + + gather_defined=false; + + gathered_halo=true; + + exchange = ogs.exchange; +} + +} //namespace ogs - A.o_blockRowStarts = platform.malloc((A.NrowBlocks+1)*sizeof(dlong), A.blockRowStarts); -} \ No newline at end of file +} //namespace libp diff --git a/libs/ogs/ogsUtils.cpp b/libs/ogs/ogsUtils.cpp new file mode 100644 index 000000000..69c469a02 --- /dev/null +++ b/libs/ogs/ogsUtils.cpp @@ -0,0 +1,127 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include +#include "ogs.hpp" +#include "ogs/ogsOperator.hpp" +#include "ogs/ogsExchange.hpp" +#include "ogs/ogsUtils.hpp" + +namespace libp { + +namespace ogs { + +stream_t ogsBase_t::dataStream; + +kernel_t ogsOperator_t::gatherScatterKernel[4][4]; +kernel_t ogsOperator_t::gatherKernel[4][4]; +kernel_t ogsOperator_t::scatterKernel[4]; + +kernel_t ogsExchange_t::extractKernel[4]; + + +void InitializeKernels(platform_t& platform, const Type type, const Op op) { + + //check if the gather kernel is initialized + if (!ogsOperator_t::gatherKernel[type][op].isInitialized()) { + + properties_t kernelInfo = platform.props(); + + kernelInfo["defines/p_blockSize"] = ogsOperator_t::blockSize; + kernelInfo["defines/p_gatherNodesPerBlock"] = ogsOperator_t::gatherNodesPerBlock; + + switch (type) { + case Float: kernelInfo["defines/T"] = "float"; break; + case Double: kernelInfo["defines/T"] = "double"; break; + case Int32: kernelInfo["defines/T"] = "int32_t"; break; + case Int64: kernelInfo["defines/T"] = "int64_t"; break; + } + + switch (type) { + case Float: + switch (op) { + case Add: kernelInfo["defines/OGS_OP_INIT"] = float{0}; break; + case Mul: kernelInfo["defines/OGS_OP_INIT"] = float{1}; break; + case Min: kernelInfo["defines/OGS_OP_INIT"] = std::numeric_limits::max(); break; + case Max: kernelInfo["defines/OGS_OP_INIT"] = -std::numeric_limits::max(); break; + } + break; + case Double: + switch (op) { + case Add: kernelInfo["defines/OGS_OP_INIT"] = double{0}; break; + case Mul: kernelInfo["defines/OGS_OP_INIT"] = double{1}; break; + case Min: kernelInfo["defines/OGS_OP_INIT"] = std::numeric_limits::max(); break; + case Max: kernelInfo["defines/OGS_OP_INIT"] = -std::numeric_limits::max(); break; + } + 
break; + case Int32: + switch (op) { + case Add: kernelInfo["defines/OGS_OP_INIT"] = int32_t{0}; break; + case Mul: kernelInfo["defines/OGS_OP_INIT"] = int32_t{1}; break; + case Min: kernelInfo["defines/OGS_OP_INIT"] = std::numeric_limits::max(); break; + case Max: kernelInfo["defines/OGS_OP_INIT"] = -std::numeric_limits::max(); break; + } + break; + case Int64: + switch (op) { + case Add: kernelInfo["defines/OGS_OP_INIT"] = int64_t{0}; break; + case Mul: kernelInfo["defines/OGS_OP_INIT"] = int64_t{1}; break; + case Min: kernelInfo["defines/OGS_OP_INIT"] = std::numeric_limits::max(); break; + case Max: kernelInfo["defines/OGS_OP_INIT"] = -std::numeric_limits::max(); break; + } + break; + } + + switch (op) { + case Add: kernelInfo["defines/OGS_OP(a,b)"] = "a+=b"; break; + case Mul: kernelInfo["defines/OGS_OP(a,b)"] = "a*=b"; break; + case Min: kernelInfo["defines/OGS_OP(a,b)"] = "if(b(Nverts); + gcomm.Allreduce(gNVertsGlobal); + + /*Get global element count offsets*/ + hlong localNverts=static_cast(Nverts); + gcomm.Scan(localNverts, gVoffsetU); + gVoffsetL = gVoffsetU-Nverts; + + /* build list of faces */ + memory faces(Nelements*Nfaces); + + for(dlong e=0;e()); + + faces[id].element = e + gVoffsetL; + faces[id].face = f; + + faces[id].elementN= -1; + faces[id].faceN = -1; + } + } + + /* sort faces by their vertex number pairs */ + std::sort(faces.ptr(), faces.ptr()+Nelements*Nfaces, + [&](const parallelFace_t& a, const parallelFace_t& b) { + return std::lexicographical_compare(a.v, a.v+NfaceVerts, + b.v, b.v+NfaceVerts); + }); + + /* scan through sorted face lists looking for adjacent + faces that have the same vertex ids */ + for(dlong n=0;n b.element) return false; + + return (a.face < b.face); + }); + + /* extract the element to element and element to face connectivity */ + + // count # of elements to send to each rank based on + // minimum {vertex id % gsize} + memory Nsend(gsize, 0); + memory Nrecv(gsize); + memory sendOffsets(gsize); + memory 
recvOffsets(gsize); + + int allNsend=0; + for(dlong e=0;e-1) { /*matched face*/ + elements[e].E[f] = faces[id].elementN; //global id + elements[e].F[f] = faces[id].faceN; + } else { /*unmatched*/ + elements[e].E[f] = -1; //global id + elements[e].F[f] = -1; /*mark face*/ + + // find rank of destination for sorting based on min(face vertices)%gsize + int destRank = static_cast(faces[id].v[0]%gsize); + + // increment send gsize for + ++Nsend[destRank]; + ++allNsend; + } + } + } + + // find send offsets + sendOffsets[0]=0; + for(int rr=1;rr sendFaces(allNsend); + + // pack face data + for(dlong e=0;e(faces[id].v[0]%gsize); + + // populate face to send out staged in segment of sendFaces array + const int sid = sendOffsets[destRank]+Nsend[destRank]; + sendFaces[sid] = faces[id]; + sendFaces[sid].rank = grank; + ++Nsend[destRank]; + } + } + } + faces.free(); + + // exchange counts + gcomm.Alltoall(Nsend, Nrecv); + + // count incoming faces + int allNrecv = 0; + for(int rr=0;rr recvFaces(allNrecv); + + // exchange parallel faces + gcomm.Alltoallv(sendFaces, Nsend, sendOffsets, + recvFaces, Nrecv, recvOffsets); + + // local sort allNrecv received faces + std::sort(recvFaces.ptr(), recvFaces.ptr()+allNrecv, + [&](const parallelFace_t& a, const parallelFace_t& b) { + return std::lexicographical_compare(a.v, a.v+NfaceVerts, + b.v, b.v+NfaceVerts); + }); + + // find matches + for(int n=0;n b.rank) return false; + + if(a.element < b.element) return true; + if(a.element > b.element) return false; + + return (a.face < b.face); + }); + + // send faces back from whence they came + gcomm.Alltoallv(recvFaces, Nrecv, recvOffsets, + sendFaces, Nsend, sendOffsets); + + // extract connectivity info + for(int cnt=0;cnt(sendFaces[cnt].element-gVoffsetL); + hlong eN = sendFaces[cnt].elementN; + int f = sendFaces[cnt].face; + int fN = sendFaces[cnt].faceN; + + if(eN>=0 && fN>=0){ /*match found*/ + elements[e].E[f] = eN; + elements[e].F[f] = fN; + } + } +} + +} //namespace paradogs + +} 
//namespace libp diff --git a/libs/parAdogs/parAdogsCuthillMckee.cpp b/libs/parAdogs/parAdogsCuthillMckee.cpp new file mode 100644 index 000000000..22e3cabf7 --- /dev/null +++ b/libs/parAdogs/parAdogsCuthillMckee.cpp @@ -0,0 +1,153 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +#include "parAdogs.hpp" +#include "parAdogs/parAdogsGraph.hpp" +#include "parAdogs/parAdogsPartition.hpp" +#include + +namespace libp { + +namespace paradogs { + +void graph_t::CuthillMckee() { + + /*Look for first node with lowest degree*/ + int minDegree=Nfaces+1; + dlong minloc=-1; + for (dlong e=0;e=gVoffsetL) && (eN q; + + memory newId(Nelements); //TODO halo region here + + /*mark nodes as unvisted*/ + memory visited(Nelements, false); + + /*Start with minimal degree element*/ + q.push(minloc); + visited[minloc] = true; + + dlong cnt=0; + do { + if (q.empty()) { + if (cnt==Nelements){ + break; //Done + } else { + /*Disconnected? Pick another random node and try to keep growing*/ + minDegree=Nfaces+1; + minloc=-1; + for (dlong e=0;e=gVoffsetL) && (eN=gVoffsetL) && (eN(eN-gVoffsetL); //local id + if (visited[eL]==false) { + q.push(eL); + visited[eL]=true; + } + } + } + } while(true); + + /*we now have a new local odering*/ + + /*Share the new ids*/ + //TODO halo exchange here + + /*Update connectivity*/ + for(dlong e=0;e=gVoffsetL) && (eN(eN-gVoffsetL); + elements[e].E[f] = newId[eL]; + } else { + /*Need to think about how to update. Maybe it's easier to wrangle the graph for this? 
*/ + } + } + } + } + + /*Permute local arrays to new ordering*/ + for(dlong e=0;e(newId[e]-gVoffsetL); + while (pe!=e) { + //swap + std::swap(elements[e], elements[pe]); + + std::swap(newId[e], newId[pe]); + pe = static_cast(newId[e]-gVoffsetL); + } + } +} + +} //namespace paradogs + +} //namespace libp + diff --git a/libs/parAdogs/parAdogsFiedlerVector.cpp b/libs/parAdogs/parAdogsFiedlerVector.cpp new file mode 100644 index 000000000..99de2a028 --- /dev/null +++ b/libs/parAdogs/parAdogsFiedlerVector.cpp @@ -0,0 +1,176 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +#include "parAdogs.hpp" +#include "parAdogs/parAdogsGraph.hpp" +#include + +extern "C" { + void dsyev_ (char *JOBZ, char *UPLO, int *N, double *A, int *LDA, double *W, double *WORK, int *LWORK, int *INFO); +} + +namespace libp { + +namespace paradogs { + +/*Compute Fiedler vector of graph via multilevel heirarchy*/ +memory& graph_t::FiedlerVector() { + + /*Fiedler vector on coarsest level*/ + L[Nlevels-1].FiedlerVector(); + + /*Project and improve the Fiedler vector to the fine level*/ + for (int l=Nlevels-2;l>=0;--l) { + /*Prolongate Fiedler vector to fine graph*/ + L[l].P.SpMV(1.0, L[l+1].Fiedler, 0.0, L[l].Fiedler); + + /*Refine the Fiedler vector*/ + Refine(l); + } + + return L[0].Fiedler; +} + + + +/*Compute Fiedler vector of graph Laplacian*/ +void mgLevel_t::FiedlerVector() { + + const int N = static_cast(A.Nrows); + + int size = A.comm.size(); + memory counts(size); + memory offsets(size); + + //collect partitioning info + A.comm.Allgather(N, counts); + + int Ntotal=0; + for (int r=0;r localA(N*Ntotal, 0.0); + + /*Add sparse entries*/ + #pragma omp parallel for + for (int n=0;n(A.diag.rowStarts[n]); + const int end = static_cast(A.diag.rowStarts[n+1]); + for (int m=start;m(A.diag.cols[m] + A.colOffsetL); + localA[n*Ntotal+col] += A.diag.vals[m]; + } + } + #pragma omp parallel for + for (int n=0;n(A.offd.rows[n]); + const int start = static_cast(A.offd.mRowStarts[n]); + const int end = static_cast(A.offd.mRowStarts[n+1]); + for (int m=start;m(A.colMap[A.offd.cols[m]]); + localA[row*Ntotal+col] += A.offd.vals[m]; + } + } + + //assemble the full matrix + memory M(Ntotal*Ntotal); + + for (int r=0;r W(Ntotal); + dsyev_(&JOBZ, &UPLO, &Ntotal, M.ptr(), &LDA, W.ptr(), &WORKSIZE, &LWORK, &INFO); //Size query + + LWORK = int(WORKSIZE); + double *WORK= new double[LWORK]; + dsyev_(&JOBZ, &UPLO, &Ntotal, M.ptr(), &LDA, W.ptr(), WORK, &LWORK, &INFO); + delete[] WORK; + + LIBP_ABORT("Paradogs: dsyev_ reports info = " << INFO << " in FiedlerVector", + INFO); + + 
/*Find the second smallest eigenvalue (the smallest is 0)*/ + double min0 = std::numeric_limits::max(); + double min1 = std::numeric_limits::max(); + int minloc0 = -1; + int minloc1 = -1; + for (int i=0;i minV = M + minloc1*Ntotal; + for (int i=0;i& faceVertices, + const memory& EToV, + const memory& EX, + const memory& EY, + const memory& EZ, + comm_t _comm): + platform(_platform), + Nverts(_Nelements), + Nelements(_Nelements), + dim(_dim), + Nfaces(_Nfaces), + NelementVerts(_Nverts), + NfaceVerts(_NfaceVerts) { + + gcomm = _comm.Dup(); + grank = gcomm.rank(); + gsize = gcomm.size(); + + comm = _comm.Dup(); + rank = comm.rank(); + size = comm.size(); + + for (int n=0;n(Nverts); + comm.Allreduce(NVertsGlobal); + + /*Get global element count offsets*/ + hlong localNverts=static_cast(Nverts); + comm.Scan(localNverts, VoffsetU); + VoffsetL = VoffsetU-Nverts; + + gNVertsGlobal = NVertsGlobal; + gVoffsetL = VoffsetL; + gVoffsetU = VoffsetU; + + /*Create array of packed element data*/ + elements.malloc(Nelements); + + if (dim==2) { + for (dlong e=0;e& partition) { + + /*Count how much of each partition we have locally*/ + dlong Nverts0=0; + dlong Nverts1=0; + for (dlong n=0;n(Nverts0); + hlong globalNverts1=static_cast(Nverts1); + comm.Allreduce(globalNverts0); + comm.Allreduce(globalNverts1); + + /*Get offsets of partitions on each rank*/ + memory starts0(size+1); + memory starts1(size+1); + starts0[0]=0; + starts1[0]=0; + hlong localNverts0 = static_cast(Nverts0); + hlong localNverts1 = static_cast(Nverts1); + comm.Allgather(localNverts0, starts0+1); + comm.Allgather(localNverts1, starts1+1); + + for(int r=0;r(globalNverts0 - chunk0*size0); + const int remainder1 = static_cast(globalNverts1 - chunk1*size1); + + memory Nsend0(size,0); + memory Nsend1(size,0); + memory Nrecv0(size); + memory Nrecv1(size); + memory sendOffsets0(size); + memory sendOffsets1(size); + memory recvOffsets0(size); + memory recvOffsets1(size); + + memory newIds(Nverts+Nhalo); + + /*Determine new 
ids and send counts*/ + dlong cnt0=0; + dlong cnt1=0; + for(dlong e=0;e=VoffsetL && gE(gE-VoffsetL); + } else { /*halo neighbor*/ + eN = colIds[cnt++]; /*Get the local id in the halo (we make this when building the Laplacian)*/ + } + + const int partN = partition[eN]; + if (partN==part) { /*If both elements are in the same partition*/ + elements[e].E[f] = newIds[eN]; /*Re index*/ + } else { + elements[e].E[f] = -1;/*else break connections across the partitions*/ + } + } + } + } + } + newIds.free(); + + // find send offsets + sendOffsets0[0]=0; + sendOffsets1[0]=0; + for(int r=1;r sendElements0(NsendTotal0); + memory sendElements1(NsendTotal1); + + cnt0=0; + cnt1=0; + for(dlong e=0;e null; + + // exchange elements + if (rank(Nverts); + comm.Allreduce(NVertsGlobal); + + /*Get global element count offsets*/ + hlong localNverts=static_cast(Nverts); + comm.Scan(localNverts, VoffsetU); + VoffsetL = VoffsetU-Nverts; +} + +void graph_t::Report() { + + /* Min,Avg,Max Element counts*/ + hlong globalNverts = static_cast(Nverts); + gcomm.Allreduce(globalNverts); + dfloat avgNverts = static_cast(globalNverts)/gsize; + + dlong minNverts=Nverts; + dlong maxNverts=Nverts; + gcomm.Allreduce(minNverts, Comm::Min); + gcomm.Allreduce(maxNverts, Comm::Max); + + + dlong cut=0.0; + for (dlong n=0;n=gVoffsetU) ) { + cut++; + } + } + } + } + + hlong gCut = static_cast(cut); + gcomm.Allreduce(gCut); + hlong avgCut = gCut/gsize; + + dlong minCut=cut; + dlong maxCut=cut; + gcomm.Allreduce(minCut, Comm::Min); + gcomm.Allreduce(maxCut, Comm::Max); + + if(grank==0) { + printf("--------------------------------------ParAdogs Report------------------------------------------\n"); + printf("-----------------------------------------------------------------------------------------------\n"); + printf(" Nranks | Elements | Per Rank Elements | Halo Faces | Per Rank Halo Faces |\n"); + printf(" | | (min,avg,max) | | (min,avg,max) |\n"); + 
printf("-----------------------------------------------------------------------------------------------\n"); + printf( "%9d | %11lld | %13lld | %12lld | %13lld |\n", + gsize, + static_cast(globalNverts), + static_cast(minNverts), + static_cast(gCut), + static_cast(minCut)); + printf(" | | %13lld | | %13lld |\n", + static_cast(avgNverts), + static_cast(avgCut)); + printf(" | | %13lld | | %13lld |\n", + static_cast(maxNverts), + static_cast(maxCut)); + printf("-----------------------------------------------------------------------------------------------\n"); + } +} + +void graph_t::ExtractMesh(dlong &Nelements_, + memory& EToV, + memory& EToE, + memory& EToF, + memory& EX, + memory& EY, + memory& EZ) { + + /*Destroy any exiting mesh data and create new data from current graph*/ + Nelements_ = Nelements; + + EToV.malloc(Nelements*NelementVerts); + EToE.malloc(Nelements*NelementVerts); + EToF.malloc(Nelements*NelementVerts); + + EX.malloc(Nelements*NelementVerts); + EY.malloc(Nelements*NelementVerts); + if (dim==3) + EZ.malloc(Nelements*NelementVerts); + + if (dim==2) { + for (dlong e=0;e partition(Nverts); + + memory I; + I.calloc(9); + + memory x, y, z; + + if (dim==2) { + x.malloc(Nverts); + y.malloc(Nverts); + + /*Compute center of mass of each element*/ + for (dlong e=0;e avg(2); + + avg[0]=0.0; + avg[1]=0.0; + for (dlong e=0;e avg(3); + + avg[0]=0.0; + avg[1]=0.0; + avg[2]=0.0; + for (dlong e=0;emax) { + max = W[i]; + maxloc = i; + } + } + // printf("max = %f, maxloc = %d \n", max, maxloc); + + /*Princial axis is the eigenvector with largest eigenvalue*/ + double a[3]; + memory maxV = I + maxloc*N; + for (int i=0;i F(Nverts); + + if (dim==2) { + for (dlong e=0;e(size0)/size; + bipartitionFraction[1] = 1.0 - bipartitionFraction[0]; + + /*Bipartition and redistribute, update size*/ + InertialBipartition(bipartitionFraction); + + /*Recursive call*/ + InertialPartition(); +} + +} //namespace paradogs + +} //namespace libp diff --git 
a/libs/parAdogs/parAdogsMatrix.cpp b/libs/parAdogs/parAdogsMatrix.cpp new file mode 100644 index 000000000..cd6b4f543 --- /dev/null +++ b/libs/parAdogs/parAdogsMatrix.cpp @@ -0,0 +1,416 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +#include "parAdogs.hpp" +#include "parAdogs/parAdogsMatrix.hpp" +#include +#include + +#ifdef GLIBCXX_PARALLEL +#include +using __gnu_parallel::sort; +#else +using std::sort; +#endif + +namespace libp { + +namespace paradogs { + +std::mt19937 RNG; + +//------------------------------------------------------------------------ +// +// parCSR matrix +// +//------------------------------------------------------------------------ + +void parCSR::SpMV(const dfloat alpha, memory& x, + const dfloat beta, memory& y) { + + halo.ExchangeStart(x, 1); + + // z[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j]) + #pragma omp parallel for + for(dlong i=0; i& x, + const dfloat beta, const memory& y, memory& z) { + + halo.ExchangeStart(x, 1); + + // z[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j]) + #pragma omp parallel for + for(dlong i=0; i& entries, + const platform_t &_platform, + comm_t _comm): + platform(_platform), + comm(_comm) { + + Nrows = _Nrows; + Ncols = _Ncols; + + /*Get global row/col offsets*/ + hlong localNrows = static_cast(Nrows); + hlong localNcols = static_cast(Ncols); + comm.Scan(localNrows, rowOffsetU); + comm.Scan(localNcols, colOffsetU); + rowOffsetL = rowOffsetU-Nrows; + colOffsetL = colOffsetU-Ncols; + + diag.rowStarts.malloc(Nrows+1); + offd.rowStarts.malloc(Nrows+1); + + #pragma omp parallel for + for (dlong n=0;n(entries[n].row-rowOffsetL); + if ( (entries[n].col < colOffsetL) + || (entries[n].col >= colOffsetU)) { + offd.rowStarts[row+1]++; + } else { + diag.rowStarts[row+1]++; + } + } + + // count how many rows are shared + offd.nzRows=0; + for(dlong i=0; i0) offd.nzRows++; + + offd.rows.malloc(offd.nzRows); + offd.mRowStarts.malloc(offd.nzRows+1); + + // cumulative sum + dlong cnt=0; + offd.mRowStarts[0]=0; + for(dlong i=0; i0) { + offd.rows[cnt] = i; //record row id + offd.mRowStarts[cnt+1] = offd.mRowStarts[cnt] + offd.rowStarts[i+1]; + cnt++; + } + diag.rowStarts[i+1] += diag.rowStarts[i]; + offd.rowStarts[i+1] += offd.rowStarts[i]; + } + diag.nnz 
= diag.rowStarts[Nrows]; + offd.nnz = offd.rowStarts[Nrows]; + + // Halo setup + cnt=0; + memory colIds(offd.nnz); + for (dlong n=0;n= colOffsetU)) { + colIds[cnt++] = entries[n].col; + } + } + haloSetup(colIds); //setup halo, and transform colIds to a local indexing + + // //fill the CSR matrices + diag.cols.malloc(diag.nnz); + offd.cols.malloc(offd.nnz); + diag.vals.malloc(diag.nnz); + offd.vals.malloc(offd.nnz); + dlong diagCnt = 0; + dlong offdCnt = 0; + for (dlong n=0;n= colOffsetU)) { + offd.cols[offdCnt] = colIds[offdCnt]; + offd.vals[offdCnt] = entries[n].val; + offdCnt++; + } else { + diag.cols[diagCnt] = static_cast(entries[n].col-colOffsetL); + diag.vals[diagCnt] = entries[n].val; + diagCnt++; + } + } +} + +//------------------------------------------------------------------------ +// +// parCSR halo setup +// +//------------------------------------------------------------------------ + +typedef struct { + + dlong localId; + hlong globalId; + + dlong newId; + +} parallelId_t; + + +void parCSR::haloSetup(memory& colIds) { + + //collect the unique nonlocal column ids + memory parIds(offd.nnz); + + for (dlong n=0;n b.globalId) return false; + + return (a.localId < b.localId); + }); + + //count unique nonlocal column ids + dlong Noffdcols = 0; //number of unique columns + if(offd.nnz) parIds[0].newId = Noffdcols; + for (dlong n=1;n offdcols(Noffdcols); + Noffdcols = 0; + if(offd.nnz) offdcols[Noffdcols++] = parIds[0].globalId; + for (dlong n=1;n b.localId) return false; + + return (a.globalId < b.globalId); + }); + + // be careful to make sure Ncols is set at this point + NlocalCols = Ncols; + Ncols += Noffdcols; + + //make an array of all the column ids required on this rank (local first) + colMap.malloc(Ncols); + for (dlong n=0; n& null){ + + int k = 10; + + hlong Ntotal = static_cast(Nrows); + comm.Allreduce(Ntotal); + if(k > Ntotal) k = (int) Ntotal; + + // do an arnoldi + + // allocate memory for Hessenberg matrix + memory H(k*k, 0.0); + + // allocate 
memory for basis + memory V((k+1)*Nrows); + memory Vx(Ncols); + + /*Create rng*/ + std::uniform_real_distribution distrib(-0.5, 0.5); + + // generate a random vector for initial basis vector + for(dlong n=0; n Vj = V+j*Nrows; + memory Vjp1 = V+(j+1)*Nrows; + + //Vx = V[j] + #pragma omp parallel for + for(dlong n=0; n Vi = V+i*Nrows; + // H(i,j) = v[i]'*A*v[j] + // dfloat hij = vectorInnerProd(Nrows, V[i], V[j+1],comm); + dfloat hij=0.0; + for(dlong n=0; n WR(k); + memory WI(k); + + linAlg_t::matrixEigenValues(k, H, WR, WI); + + double RHO = 0.; + + for(int i=0; i + +namespace libp { + +namespace paradogs { + +extern std::mt19937 RNG; + +void MeshPartition(platform_t &platform, + settings_t &settings, + dlong &Nelements, + const int dim, + const int Nverts, + const int Nfaces, + const int NfaceVertices, + const memory& faceVertices, + memory& EToV, + memory& EToE, + memory& EToF, + memory& EX, + memory& EY, + memory& EZ, + comm_t comm) { + + /* Create RNG*/ + RNG = std::mt19937(comm.rank()); + + /* Create graph from mesh info*/ + graph_t graph(platform, + Nelements, + dim, + Nverts, + Nfaces, + NfaceVertices, + faceVertices, + EToV, + EX, + EY, + EZ, + comm); + + timePoint_t timeStart = GlobalTime(comm); + + if (settings.compareSetting("PARADOGS PARTITIONING", "INERTIAL")) { + /*Inertial partitioning*/ + graph.InertialPartition(); + } else if (settings.compareSetting("PARADOGS PARTITIONING", "SPECTRAL")) { + /*Connect element faces before partitioning*/ + if (comm.size()>1) graph.Connect(); + + /*Spectral partitioning*/ + graph.SpectralPartition(); + } + + /*Connect element faces after partitioning*/ + graph.Connect(); + + /*Reorder rank-local element list for better locality*/ + graph.CuthillMckee(); + + timePoint_t timeEnd = GlobalTime(comm); + double elaplsed = ElapsedTime(timeStart, timeEnd); + + /*Print some stats about the partitioning*/ + graph.Report(); + + if (comm.rank()==0) { + printf(" Partitioning time: %5.2f seconds |\n", + elaplsed); + 
printf("-----------------------------------------------------------------------------------------------\n"); + } + + /*Get the new mesh data*/ + graph.ExtractMesh(Nelements, + EToV, + EToE, + EToF, + EX, + EY, + EZ); +} + +} //namespace paradogs + +} //namespace libp diff --git a/libs/parAdogs/parAdogsMultigrid.cpp b/libs/parAdogs/parAdogsMultigrid.cpp new file mode 100644 index 000000000..b2d98cda5 --- /dev/null +++ b/libs/parAdogs/parAdogsMultigrid.cpp @@ -0,0 +1,94 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +#include "parAdogs.hpp" +#include "parAdogs/parAdogsGraph.hpp" +#include "parAdogs/parAdogsMultigrid.hpp" + +namespace libp { + +namespace paradogs { + +/****************************************/ +/* Multigrid vcycle */ +/****************************************/ +void graph_t::MultigridVcycle(const int l, + memory& r, + memory& x) { + + //check for base level + if(l==Nlevels-1) { + coarseSolver.Solve(r, x); + return; + } + + mgLevel_t& Lf = L[l]; + memory& res = Lf.RES; + + mgLevel_t& Lc = L[l+1]; + memory& rC = Lc.RHS; + memory& xC = Lc.X; + + //Pre smooth and then compute res = rhs-Ax + Lf.Smooth(r, x, true); + Lf.Residual(r, x, res); + + // rhsC = P^T res + Lf.Coarsen(res, rC); + + // Recursive call + MultigridVcycle(l+1, rC, xC); + // for (int n=0;n& r, memory& x, memory& res) { + A.SpMV(-1.0, x, 1.0, r, res); +} + +void mgLevel_t::Coarsen(memory& x, memory& xC) { + R.SpMV(1.0, x, 0.0, xC); +} + +void mgLevel_t::Prolongate(memory& xC, memory& x) { + P.SpMV(1.0, xC, 1.0, x); +} + +void mgLevel_t::Smooth(memory& r, memory& x, const bool xIsZero) { + const int ChebyshevIterations=2; + A.SmoothChebyshev(r, x, lambda0, lambda1, + xIsZero, scratch, + ChebyshevIterations); +} + +} //namespace paradogs + +} //namespace libp diff --git a/libs/parAdogs/parAdogsMultigridAggregate.cpp b/libs/parAdogs/parAdogsMultigridAggregate.cpp new file mode 100644 index 000000000..892c35ff2 --- /dev/null +++ b/libs/parAdogs/parAdogsMultigridAggregate.cpp @@ -0,0 +1,315 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, 
subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "parAdogs.hpp" +#include "parAdogs/parAdogsMatrix.hpp" +#include "parAdogs/parAdogsPartition.hpp" +#include + +namespace libp { + +namespace paradogs { + +extern std::mt19937 RNG; + +/*Create a vertex matching using distance-2 aggregation*/ +void parCSR::Aggregate(dlong& Nc, + const dfloat theta, + memory& FineToCoarse) { + + /*Create rng*/ + std::uniform_real_distribution<> distrib(-0.25, 0.25); + + parCSR strong(Nrows, Ncols, platform, comm); + strong.diag.rowStarts.malloc(Nrows+1); + + #pragma omp parallel for + for(dlong i=0; i theta*(sqrt(Aii*Ajj))) + strong_per_row++; + } + //non-local entries + Jstart = offd.rowStarts[i]; + Jend = offd.rowStarts[i+1]; + for(dlong jj= Jstart; jj theta*(sqrt(Aii*Ajj))) + strong_per_row++; + } + + strong.diag.rowStarts[i+1] = strong_per_row; + } + + // cumulative sum + for(dlong i=1; i theta*(sqrt(Aii*Ajj))) { + strong.diag.cols[counter] = col; + strong.diag.vals[counter++] = std::abs(diag.vals[jj]) + distrib(paradogs::RNG); + } + } + //non-local entries + Jstart = offd.rowStarts[i]; + Jend = offd.rowStarts[i+1]; + for(dlong jj= Jstart; jj theta*(sqrt(Aii*Ajj))) { + strong.diag.cols[counter] = col; + strong.diag.vals[counter++] = std::abs(offd.vals[jj]) + distrib(paradogs::RNG); + } + } + } + + memory rand(Ncols); + memory Ts(Ncols); + memory Tr(Ncols); + memory Tn(Ncols); + + 
/*Initialize state array*/ + /* 0 - Undecided */ + /* -1 - Not MIS */ + /* 1 - MIS */ + memory state(Ncols, 0); + + /*Use vertex degree with random noise to break ties*/ + // #pragma omp parallel for + for (dlong n=0;nsmax) || /*If neighbor is MIS node*/ + ((sk==smax)&&(rk>rmax)) || /*Else if it has a bigger weight*/ + ((sk==smax)&&(rk==rmax)&&(nk>nmax))) { /*Rare, but just in case, break tie with index number*/ + smax = sk; + rmax = rk; + nmax = nk; + } + } + Ts[n] = smax; + Tr[n] = rmax; + Tn[n] = nmax; + } + + //share results + halo.Exchange(Ts, 1); + halo.Exchange(Tr, 1); + halo.Exchange(Tn, 1); + + // second neighbours + #pragma omp parallel for + for(dlong n=0; nsmax) || /*If neighbor is MIS node*/ + ((sk==smax)&&(rk>rmax)) || /*Else if it has a bigger weight*/ + ((sk==smax)&&(rk==rmax)&&(nk>nmax))) { /*Rare, but just in case, break tie with index number*/ + smax = sk; + rmax = rk; + nmax = nk; + } + } + + // if I am the strongest among all the 1 and 2 ring neighbours + // I am an MIS node + if(nmax == colMap[n]) state[n] = 1; + + // if there is an MIS node within distance 2, I am removed + if(smax>0) state[n] = -1; + } + + //share results + halo.Exchange(state, 1); + + // if number of undecided nodes = 0, algorithm terminates + hlong cnt = 0; + for (dlong n=0;n(Nc); + hlong NcOffsetL=0, NcOffsetU=0; + comm.Scan(localNc, NcOffsetU); + NcOffsetL = NcOffsetU-Nc; + + /*Initialize Matching array*/ + Nc=0; + for(dlong i=0; irmax) || /*If edge is strongest*/ + ((rk==rmax)&&(nk>kmax))) { /*Rare, but just in case, break tie with index number*/ + cmax = FineToCoarse[k]; + rmax = rk; + kmax = nk; + } + } + } + FineToCoarse[n] = cmax; + } + } + + //share results + halo.Exchange(FineToCoarse, 1); +} + +} //namespace paradogs + +} //namespace libp diff --git a/libs/parAdogs/parAdogsMultigridCoarseSolver.cpp b/libs/parAdogs/parAdogsMultigridCoarseSolver.cpp new file mode 100644 index 000000000..74a8d0777 --- /dev/null +++ b/libs/parAdogs/parAdogsMultigridCoarseSolver.cpp 
@@ -0,0 +1,148 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +#include "parAdogs.hpp" +#include "parAdogs/parAdogsGraph.hpp" + +namespace libp { + +namespace paradogs { + +void coarseSolver_t::Solve(memory& rhs, memory& x) { + + //gather the global rhs + comm.Allgatherv(rhs, N, + grhs, coarseCounts, coarseOffsets); + + #pragma omp parallel for + for (int n=0;n& null) { + + comm = A.comm; + int size = comm.size(); + + N = static_cast(A.Nrows); + Nrows = A.Nrows; + Ncols = A.Ncols; + + coarseCounts.malloc(size); + coarseOffsets.malloc(size); + + //collect partitioning info + comm.Allgather(N, coarseCounts); + + coarseTotal=0; + for (int r=0;r gnull(coarseTotal); + + comm.Allgatherv( null, N, + gnull, coarseCounts, coarseOffsets); + + //populate local dense matrix + memory localA(N*coarseTotal); + + /*Fill the matrix with the null boost*/ + #pragma omp parallel for + for (int n=0;n(A.diag.rowStarts[n]); + const int end = static_cast(A.diag.rowStarts[n+1]); + for (int m=start;m(A.diag.cols[m] + A.colOffsetL); + localA[n*coarseTotal+col] += A.diag.vals[m]; + } + } + #pragma omp parallel for + for (int n=0;n(A.offd.rows[n]); + const int start = static_cast(A.offd.mRowStarts[n]); + const int end = static_cast(A.offd.mRowStarts[n+1]); + for (int m=start;m(A.colMap[A.offd.cols[m]]); + localA[row*coarseTotal+col] += A.offd.vals[m]; + } + } + + //assemble the full matrix + memory gA(coarseTotal*coarseTotal); + + for (int r=0;r=VoffsetL && gE0) A.offd.nzRows++; + } + + A.offd.rows.malloc(A.offd.nzRows); + A.offd.mRowStarts.malloc(A.offd.nzRows+1); + + /*cumulative sum*/ + dlong cnt=0; + A.offd.mRowStarts[0]=0; + for (dlong e=0;e0) { + A.offd.rows[cnt] = e; //record row id + A.offd.mRowStarts[cnt+1] = A.offd.mRowStarts[cnt] + A.offd.rowStarts[e+1]; + cnt++; + } + A.diag.rowStarts[e+1] += A.diag.rowStarts[e]; + A.offd.rowStarts[e+1] += A.offd.rowStarts[e]; + } + A.diag.nnz = A.diag.rowStarts[Nverts]; + A.offd.nnz = A.offd.rowStarts[Nverts]; + + /*Halo setup*/ + cnt=0; + colIds.malloc(A.offd.nnz); + for (dlong e=0;e=VoffsetU) { + 
colIds[cnt++] = gE; + } + } + } + } + A.haloSetup(colIds); //setup halo, and transform colIds to a local indexing + Nhalo = A.Ncols-A.Nrows; /*Record how big the halo region is*/ + + /*Build connectivity*/ + A.diagA.malloc(A.Ncols); + A.diagInv.malloc(A.Nrows); + A.diag.cols.malloc(A.diag.nnz); + A.offd.cols.malloc(A.offd.nnz); + A.diag.vals.malloc(A.diag.nnz); + A.offd.vals.malloc(A.offd.nnz); + + A.diag.nnz=0; + A.offd.nnz=0; + for (dlong e=0;e=VoffsetL && gE(gE-VoffsetL); + A.diag.vals[A.diag.nnz] = -1.0; + A.diag.nnz++; + } else { + A.offd.cols[A.offd.nnz] = colIds[A.offd.nnz]; + A.offd.vals[A.offd.nnz] = -1.0; + A.offd.nnz++; + } + Ann += 1.0; + } + } + A.diagA[e] = Ann; + A.diagInv[e] = 1.0/Ann; + } + + //fill the halo region + A.halo.Exchange(A.diagA, 1); + + L[0].Nrows = A.Nrows; + L[0].Ncols = A.Ncols; + L[0].Nglobal = NVertsGlobal; + + /*Construct fine null vector*/ + L[0].null.malloc(Nverts); + + #pragma omp parallel for + for (dlong n=0;n=MAX_LEVELS); + + Lf.SetupSmoother(); + + /*Construct next level via coarsening*/ + mgLevel_t& Lc = L[Nlevels]; + Lc.CoarsenLevel(Lf, theta); + Nlevels++; + + // Increase coarsening rate as we add levels. + //See: Algebraic Multigrid On Unstructured Meshes, P Vanek, J. Mandel, M. Brezina. + theta=theta/2; + + /*Check for stalls*/ + if (Lc.Nglobal > coarseTol*Lf.Nglobal) { + LIBP_FORCE_WARNING("Paradogs: Graph coarsening stalling. 
Coarse graph has " << Lc.Nglobal << " nodes."); + coarseSolver.Setup(Lc.A, Lc.null); + break; + } + } while(true); + + for (int l=0;l0) { + /*Multigrid buffers*/ + RHS.malloc(Nrows); + X.malloc(Ncols); + } + + /*Scratch space*/ + scratch.malloc(2*Ncols); +} + + + +/*Coarsen a graph using an aggregation*/ +void mgLevel_t::CoarsenLevel(mgLevel_t& Lf, const dfloat theta) { + + /*Create a FineToCoarse mapping*/ + const dlong Nf = Lf.Nrows; + + /*Create a vertex matching*/ + dlong Nc=0; + memory FineToCoarse(Lf.Ncols); + Lf.A.Aggregate(Nc, theta, FineToCoarse); + + /* Tentative prolongation operator*/ + parCSR T = TentativeProlongator(Nf, Nc, + Lf.A.platform, Lf.A.comm, + FineToCoarse, + Lf.null, null); + FineToCoarse.free(); + + /* Smoothed prologontion */ + Lf.P = SmoothProlongator(Lf.A, T); + T = parCSR(); //Free T + + /* R = P^T*/ + Lf.R = Transpose(Lf.P); + Lf.Ncols = std::max(Lf.Ncols, Lf.R.Ncols); + + /*Galerkin product*/ + parCSR AP = SpMM(Lf.A, Lf.P); + A = SpMM(Lf.R, AP); + // A.GalerkinProduct(Lf.A, Lf.P); + AP= parCSR(); //Free AP + + /*fill diagonal*/ + A.diagA.malloc(A.Ncols); + A.diagInv.malloc(A.Nrows); + + #pragma omp parallel for + for (dlong i=0;i(Nrows); + A.comm.Allreduce(Nglobal); +} + +/*Free coarse levels of hierarchy*/ +void graph_t::MultigridDestroy() { + colIds.free(); + coarseSolver = coarseSolver_t(); + for (int n=Nlevels-1;n>=0;--n) L[n] = mgLevel_t(); + Nlevels=0; +} + +} //namespace paradogs + +} //namespace libp diff --git a/libs/parAdogs/parAdogsMultigridSmooth.cpp b/libs/parAdogs/parAdogsMultigridSmooth.cpp new file mode 100644 index 000000000..95752d187 --- /dev/null +++ b/libs/parAdogs/parAdogsMultigridSmooth.cpp @@ -0,0 +1,177 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, 
including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "parAdogs.hpp" +#include "parAdogs/parAdogsMatrix.hpp" +#include "parAdogs/parAdogsPartition.hpp" + +namespace libp { + +namespace paradogs { + +void parCSR::SmoothChebyshev(memory& b, memory& x, + const dfloat lambda0, const dfloat lambda1, + const bool xIsZero, memory& scratch, + const int ChebyshevIterations) { + + const dfloat theta = 0.5*(lambda1+lambda0); + const dfloat delta = 0.5*(lambda1-lambda0); + const dfloat invTheta = 1.0/theta; + const dfloat sigma = theta/delta; + dfloat rho_n = 1./sigma; + dfloat rho_np1; + + memory d = scratch + 0*Ncols; + memory r = scratch + 1*Ncols; + + if(xIsZero){ //skip the Ax if x is zero + // r = D^{-1}b + // d = invTheta*r + // x = d + #pragma omp parallel for + for (dlong n=0;n recvRows(A.Ncols-A.NlocalCols); + memory sendCounts(size); + memory recvCounts(size, 0); + memory sendOffsets(size+1); + memory recvOffsets(size+1); + + memory globalRowStarts(size+1); + globalRowStarts[0]=0; + T.comm.Allgather(T.rowOffsetU, globalRowStarts+1); + + //use the colMap of A to list the needed rows of T + int r=0; + for (dlong n=A.NlocalCols;n=globalRowStarts[r+1]) r++; //assumes the halo 
is sorted + recvCounts[r]++; + recvRows[n-A.NlocalCols] = id; //record the row to recv + } + globalRowStarts.free(); + + //share the counts + A.comm.Alltoall(recvCounts, sendCounts); + + sendOffsets[0]=0; + recvOffsets[0]=0; + for (r=0;r sendRows(sendTotal); + + //share the rowIds + T.comm.Alltoallv(recvRows, recvCounts, recvOffsets, + sendRows, sendCounts, sendOffsets); + + //we now have a list of rows to send, count the nnz to send + dlong nnzTotal=0; + for (r=0;r(sendRows[n]-T.rowOffsetL); //local row id + sendCounts[r]+= T.diag.rowStarts[i+1]-T.diag.rowStarts[i]; //count entries in this row + sendCounts[r]+= T.offd.rowStarts[i+1]-T.offd.rowStarts[i]; //count entries in this row + } + nnzTotal += sendCounts[r]; //tally the total + } + + memory sendNonZeros(nnzTotal); + + nnzTotal=0; //reset + for (r=0;r(sendRows[n] - T.rowOffsetL); //local row id + for (dlong jj=T.diag.rowStarts[i]; jj ToffdRows(Toffdnnz); + + T.comm.Alltoallv(sendNonZeros, sendCounts, sendOffsets, + ToffdRows, recvCounts, recvOffsets); + + //clean up + sendNonZeros.free(); + sendRows.free(); + recvRows.free(); + sendCounts.free(); + recvCounts.free(); + sendOffsets.free(); + recvOffsets.free(); + + //we now have all the needed nonlocal rows (should also be sorted by row then col) + + //make an array of row offsets so we know how large each row is + memory ToffdRowOffsets(A.Ncols-A.NlocalCols+1, 0); + + dlong id=0; + for (dlong n=0;n rowStarts(A.Nrows+1, 0); + memory rowCounts(A.Nrows, 0); + + /*Count entries per row*/ + #pragma omp parallel for + for(dlong i=0; i Ptmp(NNZ); + + //count total number of nonzeros we find + dlong nnz =0; + + // Fill the intermediate form of P + // #pragma omp parallel for + for (dlong i=0;i0) nnzRow++; + for (dlong j=1;j entries(nnz); + + //compress nonzeros + nnz = 0; + if (NNZ) entries[nnz++] = Ptmp[0]; + for (dlong i=1;i recvRows(A.Ncols-A.NlocalCols); + memory sendCounts(size); + memory recvCounts(size, 0); + memory sendOffsets(size+1); + memory 
recvOffsets(size+1); + + memory globalRowStarts(size+1); + globalRowStarts[0]=0; + B.comm.Allgather(B.rowOffsetU, globalRowStarts+1); + + //use the colMap of A to list the needed rows of B + int r=0; + for (dlong n=A.NlocalCols;n=globalRowStarts[r+1]) r++; //assumes the halo is sorted + recvCounts[r]++; + recvRows[n-A.NlocalCols] = id; //record the row to recv + } + globalRowStarts.free(); + + //share the counts + A.comm.Alltoall(recvCounts, sendCounts); + + sendOffsets[0]=0; + recvOffsets[0]=0; + for (r=0;r sendRows(sendTotal); + + //share the rowIds + B.comm.Alltoallv(recvRows, recvCounts, recvOffsets, + sendRows, sendCounts, sendOffsets); + + //we now have a list of rows to send, count the nnz to send + dlong NNZ=0; + for (r=0;r(sendRows[n]-B.rowOffsetL); //local row id + sendCounts[r]+= B.diag.rowStarts[i+1]-B.diag.rowStarts[i]; //count entries in this row + sendCounts[r]+= B.offd.rowStarts[i+1]-B.offd.rowStarts[i]; //count entries in this row + } + NNZ += sendCounts[r]; //tally the total + } + + memory sendNonZeros(NNZ); + + NNZ=0; //reset + for (r=0;r(sendRows[n] - B.rowOffsetL); //local row id + for (dlong jj=B.diag.rowStarts[i]; jj BoffdRows(Boffdnnz); + + B.comm.Alltoallv(sendNonZeros, sendCounts, sendOffsets, + BoffdRows, recvCounts, recvOffsets); + + //clean up + sendNonZeros.free(); + sendRows.free(); + recvRows.free(); + sendCounts.free(); + recvCounts.free(); + sendOffsets.free(); + recvOffsets.free(); + + //we now have all the needed nonlocal rows (should also be sorted by row then col) + + //make an array of row offsets so we know how large each row is + memory BoffdRowOffsets(A.Ncols-A.NlocalCols+1, 0); + + dlong id=0; + for (dlong n=0;n rowStarts(A.Nrows+1, 0); + memory rowCounts(A.Nrows, 0); + + /*Count entries per row*/ + #pragma omp parallel for + for (dlong i=0;i Ctmp(NNZ); + + //count total number of nonzeros; + dlong nnz =0; + + // Fill the intermediate form of C + // #pragma omp parallel for + for (dlong i=0;i0) nnzRow++; + for (dlong j=1;j 
entries(nnz); + + //compress nonzeros + nnz = 0; + if (NNZ) entries[nnz++] = Ctmp[0]; + for (dlong i=1;i& FineToCoarse, + memory& FineNull, + memory& CoarseNull) { + dlong nnz = Nf; + memory entries(nnz); + + hlong localNf=static_cast(Nf); + hlong NfOffsetL=0, NfOffsetU=0; + comm.Scan(localNf, NfOffsetU); + NfOffsetL = NfOffsetU-Nf; + + /* Each entry is the CoarseNull vector entry*/ + #pragma omp parallel for + for (dlong n=0;n +using __gnu_parallel::sort; +#else +using std::sort; +#endif + +namespace libp { + +namespace paradogs { + +parCSR Transpose(const parCSR& A) { + + // MPI info + int size = A.comm.size(); + + // copy data from nonlocal entries into send buffer + memory sendNonZeros(A.offd.nnz); + for(dlong i=0;i b.row) return false; + + return a.col < b.col; + }); + + // //count number of non-zeros we're sending + memory sendCounts(size, 0); + memory recvCounts(size); + memory sendOffsets(size+1); + memory recvOffsets(size+1); + + memory globalColStarts(size+1); + globalColStarts[0]=0; + A.comm.Allgather(A.colOffsetU, globalColStarts+1); + + int r=0; + for (dlong n=0;n=globalColStarts[r+1]) r++; + sendCounts[r]++; + } + globalColStarts.free(); + + A.comm.Alltoall(sendCounts, recvCounts); + + sendOffsets[0]=0; + recvOffsets[0]=0; + for (r=0;r offdNonZeros(offdnnz); + + // receive non-local nonzeros + A.comm.Alltoallv(sendNonZeros, sendCounts, sendOffsets, + offdNonZeros, recvCounts, recvOffsets); + + //clean up + sendNonZeros.free(); + sendCounts.free(); + recvCounts.free(); + sendOffsets.free(); + recvOffsets.free(); + + dlong NNZ = A.diag.nnz+offdnnz; + + memory entries(NNZ); + + memory rowStarts(A.NlocalCols+1, 0); + memory rowCounts(A.NlocalCols, 0); + + /*Count entries per row*/ + for(dlong i=0; i(offdNonZeros[i].row-A.colOffsetL); + rowStarts[row+1]++; + } + + /*Cumulative sum*/ + for(dlong i=1; i(offdNonZeros[i].row-A.colOffsetL); + const dlong c = rowStarts[row] + rowCounts[row]; + entries[c] = offdNonZeros[i]; + rowCounts[row]++; + } + + 
offdNonZeros.free(); + + //sort each row by column id + #pragma omp parallel for + for(dlong i=0; i +#include + +#ifdef GLIBCXX_PARALLEL +#include +using __gnu_parallel::partition; +#else +using std::partition; +#endif + +namespace libp { + +namespace paradogs { + +static dfloat Pivot(memory& A, + const dlong left, + const dlong right, + const hlong k, + const dfloat min, + const dfloat max, + comm_t comm) { + /*Start with guessing a pivot halfway between min and max*/ + const dfloat pivot = (min+max)/2.0; + + /*Bail out if we're looking at a tiny window*/ + constexpr dfloat TOL = (sizeof(dfloat)==8) ? 1.0e-13 : 1.0E-5; + if (max-min < TOL) return pivot; + + dfloat* Am = partition(A.ptr()+left, A.ptr()+right, [pivot](const dfloat& a){ return a <= pivot; }); + + /*Get how many entries are globally <= pivot*/ + hlong localCnt = Am-A.ptr(); + hlong globalCnt = localCnt; + comm.Allreduce(globalCnt); + + if (globalCnt==k) return pivot; + + if (k& F, + const hlong k, comm_t comm) { + + /*Make a copy of input vector*/ + memory A(N); + + #pragma omp parallel for + for (dlong n=0;n::max(); + dfloat globalMax=std::numeric_limits::min(); + for (dlong n=0;n& null = L[level].null; + const dlong N = L[level].Nrows; + const dlong Ncols = L[level].Ncols; + + memory& Fiedler = L[level].Fiedler; + + /*******************************************************/ + /*Improve fine Fiedler vector via Inverse Iteration */ + /*******************************************************/ + + const dfloat RELTOL = 3.0e-1; + const dfloat CG_TOL = 1.0e-2; + + const int maxIters=1; + + memory x(Ncols); + memory scratch(3*Ncols); + memory AF = scratch; + + /*AF = A*F*/ + A.SpMV(1.0, Fiedler, 0.0, AF); + + /*theta = F^T * A * F */ + dfloat theta = 0.0; + dfloat normAF = 0.0; + for (dlong n=0;nmesh3D::OccaSetup(); +namespace paradogs { - o_D = platform.malloc(Nq*Nq*sizeof(dfloat), D); +void AddSettings(settings_t& settings) { - o_S = o_D; //dummy - o_MM = o_D; //dummy - o_sM = o_D; //dummy - o_LIFT = o_D; 
//dummy + settings.newSetting("PARADOGS PARTITIONING", + "INERTIAL", + "Type of Mesh partitioning", + {"NONE", "INERTIAL", "SPECTRAL"}); +} + +void ReportSettings(settings_t& settings) { - o_vgeo = platform.malloc((Nelements+totalHaloPairs)*Nvgeo*Np*sizeof(dfloat), vgeo); - o_sgeo = platform.malloc(Nelements*Nfaces*Nfp*Nsgeo*sizeof(dfloat), sgeo); - o_ggeo = platform.malloc(Nelements*Np*Nggeo*sizeof(dfloat), ggeo); + settings.reportSetting("PARADOGS PARTITIONING"); } + +} //namespace paradogs + +} //namespace libp diff --git a/libs/parAdogs/parAdogsSolve.cpp b/libs/parAdogs/parAdogsSolve.cpp new file mode 100644 index 000000000..ff848c0bc --- /dev/null +++ b/libs/parAdogs/parAdogsSolve.cpp @@ -0,0 +1,131 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +#include "parAdogs.hpp" +#include "parAdogs/parAdogsGraph.hpp" +#include "parAdogs/parAdogsPartition.hpp" + +namespace libp { + +namespace paradogs { + +/****************************************/ +/* Solve A_{l}*x = b */ +/****************************************/ +int graph_t::Solve(const int level, + const dfloat TOL, + memory& r, + memory& x, + memory& scratch) { + + parCSR& A = L[level].A; + const dlong N = A.Nrows; + const dlong Ncols = L[level].Ncols; + + memory p = scratch + 0*Ncols; + memory Ap = scratch + 1*Ncols; + memory z = scratch + 2*Ncols; + + // register scalars + dfloat rdotz1 = 0.0; + dfloat rdotz2 = 0.0; + dfloat alpha = 0.0, beta = 0.0, pAp = 0.0; + dfloat rdotr = 1.0; + const int MAXIT = 5000; + + /* We assume that x is initialized to some guess and + r = b-A*x */ + + /*Compute x = A^{-1} b*/ + int cg_iter; + for(cg_iter=0;cg_iter 0) && (sqrt(rdotr) <= TOL))) { + break; + } + + // z = Precon^{-1} r + MultigridVcycle(level, r, z); + + // r.z + rdotz2 = rdotz1; + rdotz1 = 0.0; + for (dlong n=0;n& Fiedler = FiedlerVector(); + + /*Use Fiedler vector to bipartion graph*/ + const hlong K = std::ceil(targetFraction[0]*NVertsGlobal); + const dfloat pivot = ParallelPivot(Nverts, Fiedler, K, comm); + + memory partition(L[0].A.Ncols); + + for (dlong n=0;n(size0)/size; + bipartitionFraction[1] = 1.0 - bipartitionFraction[0]; + + /*Bipartition and redistribute, update size*/ + SpectralBipartition(bipartitionFraction); + + /*Recursive call*/ + SpectralPartition(); +} + +} //namespace paradogs + +} //namespace libp + diff --git a/libs/parAlmond/okl/SmoothChebyshev.okl b/libs/parAlmond/okl/SmoothChebyshev.okl index 583e062a0..d511ff6cd 100644 --- a/libs/parAlmond/okl/SmoothChebyshev.okl +++ b/libs/parAlmond/okl/SmoothChebyshev.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -164,4 +164,4 @@ SOFTWARE. x[n] += d_kp1; } -} \ No newline at end of file +} diff --git a/libs/parAlmond/okl/SmoothJacobi.okl b/libs/parAlmond/okl/SmoothJacobi.okl index b3c0c21d5..5f3aa6107 100644 --- a/libs/parAlmond/okl/SmoothJacobi.okl +++ b/libs/parAlmond/okl/SmoothJacobi.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -109,4 +109,4 @@ SOFTWARE. } } } -} \ No newline at end of file +} diff --git a/libs/parAlmond/okl/SpMVcsr.okl b/libs/parAlmond/okl/SpMVcsr.okl index 17ef01945..622347ffe 100644 --- a/libs/parAlmond/okl/SpMVcsr.okl +++ b/libs/parAlmond/okl/SpMVcsr.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/libs/parAlmond/okl/SpMVmcsr.okl b/libs/parAlmond/okl/SpMVmcsr.okl index 1dff7dd36..b6ec7e84a 100644 --- a/libs/parAlmond/okl/SpMVmcsr.okl +++ b/libs/parAlmond/okl/SpMVmcsr.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the 
"Software"), to deal diff --git a/libs/parAlmond/okl/dGEMV.okl b/libs/parAlmond/okl/dGEMV.okl index 971fd39b5..66a54c581 100644 --- a/libs/parAlmond/okl/dGEMV.okl +++ b/libs/parAlmond/okl/dGEMV.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/libs/parAlmond/okl/kcycleCombinedOp.okl b/libs/parAlmond/okl/kcycleCombinedOp.okl index 1ad1dd415..fddef262b 100644 --- a/libs/parAlmond/okl/kcycleCombinedOp.okl +++ b/libs/parAlmond/okl/kcycleCombinedOp.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -35,7 +35,7 @@ SOFTWARE. for(dlong n=0;n512 for(int t=0;t256 @@ -74,7 +72,6 @@ SOFTWARE. s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+256]; } } - @barrier("local"); #endif for(int t=0;t512 for(int t=0;t256 @@ -201,7 +189,6 @@ SOFTWARE. s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+256]; } } - @barrier("local"); #endif for(int t=0;t512 for(int t=0;t256 @@ -329,7 +307,6 @@ SOFTWARE. s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+256]; } } - @barrier("local"); #endif for(int t=0;t512 for(int t=0;t256 @@ -458,7 +426,6 @@ SOFTWARE. 
s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+256]; } } - @barrier("local"); #endif for(int t=0;t512 for(int t=0;t256 for(int t=0;t512 for(int t=0;t256 for(int t=0;t(platform, settings, comm); //build parAlmond kernels on first construction - if (Nrefs==0) buildParAlmondKernels(platform); - Nrefs++; + buildParAlmondKernels(platform); } -void parAlmond_t::Operator(occa::memory& o_rhs, occa::memory& o_x) { +void parAlmond_t::Operator(deviceMemory& o_rhs, deviceMemory& o_x) { if (multigrid->exact){ //call the linear solver int maxIter = 500; int verbose = settings.compareSetting("VERBOSE", "TRUE") ? 1 : 0; dfloat tol = 1e-8; - solver_t &A = *(multigrid->levels[0]); - (void) multigrid->linearSolver->Solve(A, *multigrid, o_x, o_rhs, tol, maxIter, verbose); + solver_t &A = multigrid->GetLevel(0); + (void) multigrid->linearSolver.Solve(A, *multigrid, o_x, o_rhs, tol, maxIter, verbose); } else { //apply a multigrid cycle multigrid->Operator(o_rhs, o_x); } } -//Add level to multigrid heirarchy -void parAlmond_t::AddLevel(multigridLevel* level) { - multigrid->AddLevel(level); -} - void parAlmond_t::Report() { - int rank; - MPI_Comm_rank(multigrid->comm, &rank); - - if(rank==0) { + if(multigrid->comm.rank()==0) { printf("-----------------------------Multigrid Report-----------------------------------------------\n"); printf("--------------------------------------------------------------------------------------------\n"); printf("Level | Type | Dimension | Per Rank Dim | nnz per row | Smoother |\n"); @@ -78,17 +73,21 @@ void parAlmond_t::Report() { } for(int lev=0; levnumLevels-1; lev++) { - if(rank==0) {printf(" %3d ", lev);fflush(stdout);} + if(multigrid->comm.rank()==0) {printf(" %3d ", lev);fflush(stdout);} multigrid->levels[lev]->Report(); } //base level multigrid->coarseSolver->Report(multigrid->numLevels-1); - if(rank==0) + if(multigrid->comm.rank()==0) printf("--------------------------------------------------------------------------------------------\n"); } +int 
parAlmond_t::NumLevels() { + return multigrid->numLevels; +} + dlong parAlmond_t::getNumCols(int k) { return multigrid->levels[k]->Ncols; } @@ -97,11 +96,6 @@ dlong parAlmond_t::getNumRows(int k) { return multigrid->levels[k]->Nrows; } -parAlmond_t::~parAlmond_t() { - Nrefs--; - if (Nrefs==0) freeParAlmondKernels(); - - delete multigrid; -} - } //namespace parAlmond + +} //namespace libp diff --git a/libs/parAlmond/parAlmondAMGLevel.cpp b/libs/parAlmond/parAlmondAMGLevel.cpp index 412d99cb7..ee8eb17fe 100644 --- a/libs/parAlmond/parAlmondAMGLevel.cpp +++ b/libs/parAlmond/parAlmondAMGLevel.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,14 +25,16 @@ SOFTWARE. 
*/ #include "parAlmond.hpp" -#include "parAlmond/parAlmondMultigrid.hpp" #include "parAlmond/parAlmondAMGLevel.hpp" +namespace libp { + namespace parAlmond { -amgLevel::amgLevel(parCSR *_A, settings_t& _settings): - multigridLevel(_A->Nrows, _A->Ncols, _A->platform, _settings), - A(_A) { +amgLevel::amgLevel(parCSR& _A, settings_t& _settings): + multigridLevel(_A.Nrows, _A.Ncols, _A.platform, _settings, _A.comm) { + + A = _A; //determine smoother if (settings.compareSetting("PARALMOND SMOOTHER", "CHEBYSHEV")) { @@ -43,35 +45,29 @@ amgLevel::amgLevel(parCSR *_A, settings_t& _settings): } } -amgLevel::~amgLevel() { - if ( A) delete A; - if ( P) delete P; - if ( R) delete R; +void amgLevel::Operator(deviceMemory& o_X, deviceMemory& o_Ax){ + A.SpMV(1.0, o_X, 0.0, o_Ax); } -void amgLevel::Operator(occa::memory& o_X, occa::memory& o_Ax){ - A->SpMV(1.0, o_X, 0.0, o_Ax); +void amgLevel::coarsen (deviceMemory& o_r, deviceMemory& o_Rr){ + R.SpMV(1.0, o_r, 0.0, o_Rr); } -void amgLevel::coarsen (occa::memory& o_r, occa::memory& o_Rr){ - R->SpMV(1.0, o_r, 0.0, o_Rr); +void amgLevel::prolongate(deviceMemory& o_X, deviceMemory& o_Px){ + P.SpMV(1.0, o_X, 1.0, o_Px); } -void amgLevel::prolongate(occa::memory& o_X, occa::memory& o_Px){ - P->SpMV(1.0, o_X, 1.0, o_Px); +void amgLevel::residual (deviceMemory& o_RHS, deviceMemory& o_X, + deviceMemory& o_RES) { + A.SpMV(-1.0, o_X, 1.0, o_RHS, o_RES); } -void amgLevel::residual (occa::memory& o_RHS, occa::memory& o_X, - occa::memory& o_RES) { - A->SpMV(-1.0, o_X, 1.0, o_RHS, o_RES); -} - -void amgLevel::smooth(occa::memory& o_RHS, occa::memory& o_X, bool x_is_zero){ +void amgLevel::smooth(deviceMemory& o_RHS, deviceMemory& o_X, bool x_is_zero){ if(stype == DAMPED_JACOBI){ - A->smoothDampedJacobi(o_RHS, o_X, lambda, + A.smoothDampedJacobi(o_RHS, o_X, lambda, x_is_zero, o_scratch); } else if(stype == CHEBYSHEV){ - A->smoothChebyshev(o_RHS, o_X, lambda0, lambda1, + A.smoothChebyshev(o_RHS, o_X, lambda0, lambda1, x_is_zero, o_scratch, 
ChebyshevIterations); } @@ -80,61 +76,51 @@ void amgLevel::smooth(occa::memory& o_RHS, occa::memory& o_X, bool x_is_zero){ void amgLevel::setupSmoother(){ if (stype == DAMPED_JACOBI) { - lambda = (4./3.)/A->rho; + lambda = (4./3.)/A.rho; } else if (stype == CHEBYSHEV) { - lambda1 = A->rho; - lambda0 = A->rho/10.; + lambda1 = A.rho; + lambda0 = A.rho/10.; } } void amgLevel::syncToDevice(){ - A->syncToDevice(); - if (P) P->syncToDevice(); - if (R) R->syncToDevice(); + if (A.Nrows>0) A.syncToDevice(); + if (P.Nrows>0) P.syncToDevice(); + if (R.Nrows>0) R.syncToDevice(); } void amgLevel::Report() { //This setup can be called by many subcommunicators, so only // print on the global root. - int rank; - MPI_Comm_rank(A->comm, &rank); - - hlong hNrows = (hlong) Nrows; + int totalActive=(Nrows>0) ? 1:0; + A.comm.Allreduce(totalActive); - int active = (Nrows>0) ? 1:0; - int totalActive=0; - MPI_Allreduce(&active, &totalActive, 1, MPI_INT, MPI_SUM, A->comm); + dlong minNrows=Nrows, maxNrows=Nrows; + hlong totalNrows=Nrows; + A.comm.Allreduce(maxNrows, Comm::Max); + A.comm.Allreduce(totalNrows, Comm::Sum); + dfloat avgNrows = (dfloat) totalNrows/totalActive; - dlong minNrows=0, maxNrows=0; - hlong totalNrows=0; - dfloat avgNrows; - MPI_Allreduce(&Nrows, &maxNrows, 1, MPI_DLONG, MPI_MAX, A->comm); - MPI_Allreduce(&hNrows, &totalNrows, 1, MPI_HLONG, MPI_SUM, A->comm); - avgNrows = (dfloat) totalNrows/totalActive; + if (Nrows==0) minNrows=maxNrows; //set this so it's ignored for the global min + A.comm.Allreduce(minNrows, Comm::Min); - if (Nrows==0) Nrows=maxNrows; //set this so it's ignored for the global min - MPI_Allreduce(&Nrows, &minNrows, 1, MPI_DLONG, MPI_MIN, A->comm); + long long int nnz = A.diag.nnz+A.offd.nnz; + long long int minNnz=nnz, maxNnz=nnz, totalNnz=nnz; + A.comm.Allreduce(maxNnz, Comm::Max); + A.comm.Allreduce(totalNnz, Comm::Sum); - - long long int nnz; - nnz = A->diag.nnz+A->offd.nnz; - - long long int minNnz=0, maxNnz=0, totalNnz=0; - MPI_Allreduce(&nnz, 
&maxNnz, 1, MPI_LONG_LONG_INT, MPI_MAX, A->comm); - MPI_Allreduce(&nnz, &totalNnz, 1, MPI_LONG_LONG_INT, MPI_SUM, A->comm); - - if (nnz==0) nnz = maxNnz; //set this so it's ignored for the global min - MPI_Allreduce(&nnz, &minNnz, 1, MPI_LONG_LONG_INT, MPI_MIN, A->comm); + if (nnz==0) minNnz = maxNnz; //set this so it's ignored for the global min + A.comm.Allreduce(minNnz, Comm::Min); dfloat nnzPerRow = (Nrows==0) ? 0 : (dfloat) nnz/Nrows; - dfloat minNnzPerRow=0, maxNnzPerRow=0, avgNnzPerRow=0; - MPI_Allreduce(&nnzPerRow, &maxNnzPerRow, 1, MPI_DFLOAT, MPI_MAX, A->comm); - MPI_Allreduce(&nnzPerRow, &avgNnzPerRow, 1, MPI_DFLOAT, MPI_SUM, A->comm); + dfloat minNnzPerRow=nnzPerRow, maxNnzPerRow=nnzPerRow, avgNnzPerRow=nnzPerRow; + A.comm.Allreduce(maxNnzPerRow, Comm::Max); + A.comm.Allreduce(avgNnzPerRow, Comm::Sum); avgNnzPerRow /= totalActive; - if (Nrows==0) nnzPerRow = maxNnzPerRow; - MPI_Allreduce(&nnzPerRow, &minNnzPerRow, 1, MPI_DFLOAT, MPI_MIN, A->comm); + if (Nrows==0) minNnzPerRow = maxNnzPerRow; + A.comm.Allreduce(minNnzPerRow, Comm::Min); char smootherString[BUFSIZ]; if (stype==DAMPED_JACOBI) @@ -142,11 +128,13 @@ void amgLevel::Report() { else if (stype==CHEBYSHEV) strcpy(smootherString, "Chebyshev "); - if (rank==0){ + if (comm.rank()==0){ printf( "| parAlmond | %12lld | %12d | %13d | %s|\n", (long long int) totalNrows, minNrows, (int)minNnzPerRow, smootherString); printf(" | | | %12d | %13d | |\n", maxNrows, (int)maxNnzPerRow); printf(" | | | %12d | %13d | |\n", (int)avgNrows, (int)avgNnzPerRow); } } -} //namespace parAlmond \ No newline at end of file +} //namespace parAlmond + +} //namespace libp diff --git a/libs/parAlmond/parAlmondAMGSetup.cpp b/libs/parAlmond/parAlmondAMGSetup.cpp index a3091d2dc..ab4287764 100644 --- a/libs/parAlmond/parAlmondAMGSetup.cpp +++ b/libs/parAlmond/parAlmondAMGSetup.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 
Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,58 +26,65 @@ SOFTWARE. #include "parAlmond.hpp" #include "parAlmond/parAlmondAMGSetup.hpp" +#include "parAlmond/parAlmondCoarseSolver.hpp" + +namespace libp { namespace parAlmond { void parAlmond_t::AMGSetup(parCOO& cooA, bool nullSpace, - dfloat *nullVector, + memory nullVector, dfloat nullSpacePenalty){ - int rank; - int size; - MPI_Comm_rank(cooA.comm, &rank); - MPI_Comm_size(cooA.comm, &size); + int rank = cooA.comm.rank(); + int size = cooA.comm.size(); + + if(Comm::World().rank()==0) {printf("Setting up AMG...");fflush(stdout);} + + /*Get multigrid solver*/ + multigrid_t& mg = *multigrid; - if(rank==0) {printf("Setting up AMG...");fflush(stdout);} + /*Get coarse solver*/ + coarseSolver_t& coarse = *(mg.coarseSolver); //make csr matrix from coo input - parCSR *A = new parCSR(cooA); - A->diagSetup(); + parCSR A(cooA); + A.diagSetup(); //copy fine nullvector - dfloat *null = (dfloat *) malloc(A->Nrows*sizeof(dfloat)); - memcpy(null, nullVector, A->Nrows*sizeof(dfloat)); + memory null(A.Nrows); + null.copyFrom(nullVector, A.Nrows); // find target N at coarsest level - const int gCoarseSize = multigrid->coarseSolver->getTargetSize(); - - amgLevel *L = new amgLevel(A, settings); + const int gCoarseSize = coarse.getTargetSize(); hlong globalSize; - if (multigrid->coarsetype==COARSEEXACT) { - globalSize = L->A->globalRowStarts[size]; + if (mg.coarsetype==COARSEEXACT) { + globalSize = A.globalRowStarts[size]; } else { //COARSEOAS //OAS cares about Ncols for size - hlong localSize = A->Ncols; - MPI_Allreduce(&localSize,&globalSize,1,MPI_HLONG,MPI_SUM,A->comm); + globalSize = A.Ncols; + A.comm.Allreduce(globalSize); } + amgLevel& Lbase = mg.AddLevel(A, settings); + //if the system if already small, dont create MG levels bool done = false; 
if(globalSize <= gCoarseSize){ - multigrid->AddLevel(L); - multigrid->coarseSolver->setup(A, nullSpace, null, nullSpacePenalty); - multigrid->coarseSolver->syncToDevice(); - multigrid->baseLevel = multigrid->numLevels-1; - L->syncToDevice(); + mg.AllocateLevelWorkSpace(mg.numLevels-1); + coarse.setup(A, nullSpace, null, nullSpacePenalty); + coarse.syncToDevice(); + mg.baseLevel = mg.numLevels-1; + Lbase.syncToDevice(); done = true; } //TODO: make the coarsen threasholds user-provided inputs - // For now, let default to some sensible threasholds + // For now, let default to some sensible thresholds dfloat theta=0.0; - if (multigrid->strtype==RUGESTUBEN) { + if (mg.strtype==RUGESTUBEN) { theta=0.5; //default for 3D problems //See: A GPU accelerated aggregation algebraic multigrid method, R. Gandham, K. Esler, Y. Zhang. } else { // (type==SYMMETRIC) @@ -86,49 +93,56 @@ void parAlmond_t::AMGSetup(parCOO& cooA, } while(!done){ - L->setupSmoother(); + /*Get current coarsest level*/ + amgLevel& L = mg.GetLevel(mg.numLevels-1); + + /*Build smoother*/ + L.setupSmoother(); - // Create coarse level via AMG. Coarsen null vector - amgLevel* Lcoarse = coarsenAmgLevel(L, null, - multigrid->strtype, theta, - multigrid->aggtype); - multigrid->AddLevel(L); - L->syncToDevice(); + /*Create new level*/ + amgLevel& Lcoarse = mg.AddLevel(); + + /* Coarsen level via AMG. Coarsen null vector */ + Lcoarse = coarsenAmgLevel(L, null, + mg.strtype, theta, + mg.aggtype); + + mg.AllocateLevelWorkSpace(mg.numLevels-2); + L.syncToDevice(); + + parCSR& Acoarse = Lcoarse.A; // Increase coarsening rate as we add levels. //See: Algebraic Multigrid On Unstructured Meshes, P Vanek, J. Mandel, M. Brezina. 
- if (multigrid->strtype==SYMMETRIC) + if (mg.strtype==SYMMETRIC) theta=theta/2; hlong globalCoarseSize; - if (multigrid->coarsetype==COARSEEXACT) { - globalCoarseSize = Lcoarse->A->globalRowStarts[size];; + if (mg.coarsetype==COARSEEXACT) { + globalCoarseSize = Acoarse.globalRowStarts[size];; } else { //COARSEOAS //OAS cares about Ncols for size - hlong localSize = Lcoarse->A->Ncols; - MPI_Allreduce(&localSize,&globalCoarseSize,1,MPI_HLONG,MPI_SUM,Lcoarse->A->comm); + globalCoarseSize = Acoarse.Ncols; + Acoarse.comm.Allreduce(globalCoarseSize); } if(globalCoarseSize <= gCoarseSize || globalSize < 2*globalCoarseSize){ - if (globalSize < 2*globalCoarseSize && rank==0) { - stringstream ss; - ss << "AMG coarsening stalling, attemping coarse solver setup with dimension N=" << globalCoarseSize; - LIBP_WARNING(ss.str()); - } - multigrid->AddLevel(Lcoarse); - Lcoarse->syncToDevice(); - multigrid->coarseSolver->setup(Lcoarse->A, nullSpace, null, nullSpacePenalty); - multigrid->coarseSolver->syncToDevice(); - multigrid->baseLevel = multigrid->numLevels-1; + LIBP_WARNING("AMG coarsening stalling, attemping coarse solver setup with dimension N=" << globalCoarseSize, + globalSize < 2*globalCoarseSize && rank==0); + + mg.AllocateLevelWorkSpace(mg.numLevels-1); + Lcoarse.syncToDevice(); + coarse.setup(Acoarse, nullSpace, null, nullSpacePenalty); + coarse.syncToDevice(); + mg.baseLevel = mg.numLevels-1; break; } globalSize = globalCoarseSize; - L = Lcoarse; } - free(null); - - if(rank==0) printf("done.\n"); + if(Comm::World().rank()==0) printf("done.\n"); } } //namespace parAlmond + +} //namespace libp diff --git a/libs/parAlmond/parAlmondAMGSmoother.cpp b/libs/parAlmond/parAlmondAMGSmoother.cpp index 360799210..493f8a11c 100644 --- a/libs/parAlmond/parAlmondAMGSmoother.cpp +++ b/libs/parAlmond/parAlmondAMGSmoother.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim 
Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,48 +25,49 @@ SOFTWARE. */ #include "parAlmond.hpp" -#include "parAlmond/parAlmondMultigrid.hpp" #include "parAlmond/parAlmondAMGLevel.hpp" #include "parAlmond/parAlmondKernels.hpp" +namespace libp { + namespace parAlmond { -void parCSR::smoothDampedJacobi(occa::memory& o_r, occa::memory& o_x, +void parCSR::smoothDampedJacobi(deviceMemory& o_r, deviceMemory& o_x, const dfloat lambda, bool x_is_zero, - occa::memory& o_scratch){ + deviceMemory& o_scratch){ if(x_is_zero){ // x = lambda*inv(D)*r - platform.linAlg.amxpy(Nrows, lambda, o_diagInv, o_r, 0.0, o_x); + platform.linAlg().amxpy(Nrows, lambda, o_diagInv, o_r, 0.0, o_x); return; } - occa::memory o_d = o_scratch; + deviceMemory o_d = o_scratch; - halo->ExchangeStart(o_x, 1, ogs_dfloat); + halo.ExchangeStart(o_x, 1); // d = lambda*inv(D)*(r-A*x) if (diag.NrowBlocks) SmoothJacobiCSRKernel(diag.NrowBlocks, - diag.o_blockRowStarts, diag.o_rowStarts, - diag.o_cols, diag.o_vals, - lambda, o_diagInv, - o_r, o_x, o_d); + diag.o_blockRowStarts, diag.o_rowStarts, + diag.o_cols, diag.o_vals, + lambda, o_diagInv, + o_r, o_x, o_d); - halo->ExchangeFinish(o_x, 1, ogs_dfloat); + halo.ExchangeFinish(o_x, 1); if (offd.NrowBlocks) SmoothJacobiMCSRKernel(offd.NrowBlocks, - offd.o_blockRowStarts, offd.o_mRowStarts, - offd.o_rows, offd.o_cols, offd.o_vals, - lambda, o_diagInv, o_x, o_d); + offd.o_blockRowStarts, offd.o_mRowStarts, + offd.o_rows, offd.o_cols, offd.o_vals, + lambda, o_diagInv, o_x, o_d); - platform.linAlg.axpy(Nrows, 1.0, o_d, 1.0, o_x); + platform.linAlg().axpy(Nrows, 1.0, o_d, 1.0, o_x); } -void parCSR::smoothChebyshev(occa::memory& o_b, occa::memory& o_x, +void parCSR::smoothChebyshev(deviceMemory& o_b, deviceMemory& o_x, const dfloat lambda0, const dfloat lambda1, - bool x_is_zero, 
occa::memory& o_scratch, + bool x_is_zero, deviceMemory& o_scratch, const int ChebyshevIterations) { const dfloat theta = 0.5*(lambda1+lambda0); @@ -76,8 +77,8 @@ void parCSR::smoothChebyshev(occa::memory& o_b, occa::memory& o_x, dfloat rho_n = 1./sigma; dfloat rho_np1; - occa::memory o_d = o_scratch + 0*Ncols*sizeof(dfloat); - occa::memory o_r = o_scratch + 1*Ncols*sizeof(dfloat); + deviceMemory o_d = o_scratch + 0*Ncols; + deviceMemory o_r = o_scratch + 1*Ncols; if(x_is_zero){ //skip the Ax if x is zero @@ -89,25 +90,25 @@ void parCSR::smoothChebyshev(occa::memory& o_b, occa::memory& o_x, o_b, o_r, o_d, o_x); } else { //r = D^{-1}(b-A*x) - halo->ExchangeStart(o_x, 1, ogs_dfloat); + halo.ExchangeStart(o_x, 1); const dfloat alpha = 0.0; const dfloat beta = 1.0; if (diag.NrowBlocks) SmoothChebyshevCSRKernel(diag.NrowBlocks, - diag.o_blockRowStarts, diag.o_rowStarts, - diag.o_cols, diag.o_vals, - alpha, beta, o_diagInv, - o_b, o_x, o_r); + diag.o_blockRowStarts, diag.o_rowStarts, + diag.o_cols, diag.o_vals, + alpha, beta, o_diagInv, + o_b, o_x, o_r); - halo->ExchangeFinish(o_x, 1, ogs_dfloat); + halo.ExchangeFinish(o_x, 1); if (offd.NrowBlocks) SmoothChebyshevMCSRKernel(offd.NrowBlocks, - offd.o_blockRowStarts, offd.o_mRowStarts, - offd.o_rows, offd.o_cols, offd.o_vals, - o_diagInv, o_x, o_r); + offd.o_blockRowStarts, offd.o_mRowStarts, + offd.o_rows, offd.o_cols, offd.o_vals, + o_diagInv, o_x, o_r); const int last_it = (ChebyshevIterations==0) ? 
1 : 0; @@ -124,23 +125,22 @@ void parCSR::smoothChebyshev(occa::memory& o_b, occa::memory& o_x, const dfloat beta = 0.0; //r_k+1 = r_k - D^{-1}Ad_k - halo->ExchangeStart(o_d, 1, ogs_dfloat); + halo.ExchangeStart(o_d, 1); if (diag.NrowBlocks) SmoothChebyshevCSRKernel(diag.NrowBlocks, - diag.o_blockRowStarts, diag.o_rowStarts, - diag.o_cols, diag.o_vals, - alpha, beta, o_diagInv, - o_b, o_d, o_r); + diag.o_blockRowStarts, diag.o_rowStarts, + diag.o_cols, diag.o_vals, + alpha, beta, o_diagInv, + o_b, o_d, o_r); - halo->ExchangeFinish(o_d, 1, ogs_dfloat); + halo.ExchangeFinish(o_d, 1); if (offd.NrowBlocks) SmoothChebyshevMCSRKernel(offd.NrowBlocks, - offd.o_blockRowStarts, offd.o_mRowStarts, - offd.o_rows, offd.o_cols, offd.o_vals, - o_diagInv, o_d, o_r); - + offd.o_blockRowStarts, offd.o_mRowStarts, + offd.o_rows, offd.o_cols, offd.o_vals, + o_diagInv, o_d, o_r); const int last_it = (k==ChebyshevIterations-1) ? 1 : 0; @@ -151,7 +151,7 @@ void parCSR::smoothChebyshev(occa::memory& o_b, occa::memory& o_x, if (Nrows) SmoothChebyshevUpdateKernel(Nrows, rho_np1*rho_n, - 2.0*rho_np1/delta, + dfloat(2.0)*rho_np1/delta, last_it, o_r, o_d, o_x); @@ -160,3 +160,5 @@ void parCSR::smoothChebyshev(occa::memory& o_b, occa::memory& o_x, } } //namespace parAlmond + +} //namespace libp diff --git a/libs/parAlmond/parAlmondCoarseExact.cpp b/libs/parAlmond/parAlmondCoarseExact.cpp index 74324682f..9792216f6 100644 --- a/libs/parAlmond/parAlmondCoarseExact.cpp +++ b/libs/parAlmond/parAlmondCoarseExact.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -28,23 +28,20 @@ SOFTWARE. 
#include "parAlmond/parAlmondCoarseSolver.hpp" #include "parAlmond/parAlmondKernels.hpp" -//link in the data stream from ogs -namespace ogs { - extern occa::stream dataStream; -} +namespace libp { namespace parAlmond { -void exactSolver_t::solve(occa::memory& o_rhs, occa::memory& o_x) { +void exactSolver_t::solve(deviceMemory& o_rhs, deviceMemory& o_x) { - occa::stream currentStream = platform.device.getStream(); + stream_t currentStream = platform.getStream(); //queue transfering coarse vector to host for Allgather if(N) { - platform.device.finish(); - platform.device.setStream(ogs::dataStream); - o_rhs.copyTo(diagRhs, N*sizeof(dfloat), 0, "async: true"); - platform.device.setStream(currentStream); + platform.finish(); + platform.setStream(ogs::ogsBase_t::dataStream); + o_rhs.copyTo(diagRhs, N, 0, properties_t("async", true)); + platform.setStream(currentStream); } //queue local part of gemv @@ -55,19 +52,19 @@ void exactSolver_t::solve(occa::memory& o_rhs, occa::memory& o_x) { if(offdTotal) { //wait for data to arrive on host - platform.device.setStream(ogs::dataStream); - platform.device.finish(); + platform.setStream(ogs::ogsBase_t::dataStream); + platform.finish(); //gather the offd rhs entries - MPI_Alltoallv(diagRhs, sendCounts, sendOffsets, MPI_DFLOAT, - offdRhs, coarseCounts, coarseOffsets, MPI_DFLOAT, comm); + comm.Alltoallv(diagRhs, sendCounts, sendOffsets, + offdRhs, coarseCounts, coarseOffsets); //queue transfering coarse vector to device - o_offdRhs.copyFrom(offdRhs, offdTotal*sizeof(dfloat), 0, "async: true"); - platform.device.finish(); //wait for transfer to complete + o_offdRhs.copyFrom(offdRhs, offdTotal, 0, properties_t("async", true)); + platform.finish(); //wait for transfer to complete - platform.device.setStream(currentStream); + platform.setStream(currentStream); //queue offd part of gemv if (N) @@ -80,65 +77,66 @@ int exactSolver_t::getTargetSize() { return 1000; } -void exactSolver_t::setup(parCSR *_A, bool nullSpace, - dfloat 
*nullVector, dfloat nullSpacePenalty) { +void exactSolver_t::setup(parCSR& _A, bool nullSpace, + memory nullVector, dfloat nullSpacePenalty) { A = _A; - comm = A->comm; - MPI_Comm_rank(comm, &rank); - MPI_Comm_size(comm, &size); + comm = A.comm; + rank = comm.rank(); + size = comm.size(); //copy the global coarse partition as ints - coarseOffsets = (int* ) calloc(size+1,sizeof(int)); - for (int r=0;rglobalRowStarts[r]; + coarseOffsets.malloc(size+1); + for (int r=0;r(A.globalRowStarts[r]); + } coarseTotal = coarseOffsets[size]; coarseOffset = coarseOffsets[rank]; - N = (int) A->Nrows; - Nrows = A->Nrows; - Ncols = A->Ncols; + N = static_cast(A.Nrows); + Nrows = A.Nrows; + Ncols = A.Ncols; - coarseCounts = (int*) calloc(size,sizeof(int)); + coarseCounts.malloc(size,0); - int sendNNZ = (int) (A->diag.nnz+A->offd.nnz); + int sendNNZ = static_cast(A.diag.nnz+A.offd.nnz); // if((rank==0)&&(settings.compareSetting("VERBOSE","TRUE"))) // {printf("Setting up coarse solver...");fflush(stdout);} - parCOO::nonZero_t *sendNonZeros = (parCOO::nonZero_t *) calloc(sendNNZ, sizeof(parCOO::nonZero_t)); + memory sendNonZeros(sendNNZ); //populate matrix int cnt = 0; for (int n=0;ndiag.rowStarts[n]; - const int end = (int) A->diag.rowStarts[n+1]; + const int start = static_cast(A.diag.rowStarts[n]); + const int end = static_cast(A.diag.rowStarts[n+1]); for (int m=start;mdiag.cols[m] + coarseOffset; - sendNonZeros[cnt].val = A->diag.vals[m]; + sendNonZeros[cnt].col = A.diag.cols[m] + coarseOffset; + sendNonZeros[cnt].val = A.diag.vals[m]; cnt++; } } - for (int n=0;noffd.nzRows;n++) { - const int row = (int) A->offd.rows[n]; - const int start = (int) A->offd.mRowStarts[n]; - const int end = (int) A->offd.mRowStarts[n+1]; + for (int n=0;n(A.offd.rows[n]); + const int start = static_cast(A.offd.mRowStarts[n]); + const int end = static_cast(A.offd.mRowStarts[n+1]); for (int m=start;mcolMap[A->offd.cols[m]]; - sendNonZeros[cnt].val = A->offd.vals[m]; + sendNonZeros[cnt].col = 
A.colMap[A.offd.cols[m]]; + sendNonZeros[cnt].val = A.offd.vals[m]; cnt++; } } //get the nonzero counts from all ranks - int *recvNNZ = (int*) calloc(size,sizeof(int)); - int *NNZoffsets = (int*) calloc(size+1,sizeof(int)); - MPI_Allgather(&sendNNZ, 1, MPI_INT, - recvNNZ, 1, MPI_INT, comm); + memory recvNNZ(size); + memory NNZoffsets(size+1,0); + comm.Allgather(sendNNZ, recvNNZ); int totalNNZ = 0; for (int r=0;r recvNonZeros(totalNNZ); - MPI_Allgatherv(sendNonZeros, sendNNZ, MPI_NONZERO_T, - recvNonZeros, recvNNZ, NNZoffsets, MPI_NONZERO_T, comm); + comm.Allgatherv(sendNonZeros, sendNNZ, + recvNonZeros, recvNNZ, NNZoffsets); //gather null vector - dfloat *nullTotal = (dfloat*) calloc(coarseTotal,sizeof(dfloat)); + memory nullTotal(coarseTotal); - for (int r=0;r coarseA(coarseTotal*coarseTotal, 0.0); for (int i=0;i(diagInvAT); + o_offdInvAT = platform.malloc(offdInvAT); - diagRhs = (dfloat*) calloc(N,sizeof(dfloat)); - offdRhs = (dfloat*) calloc(offdTotal,sizeof(dfloat)); + diagRhs.malloc(N); + offdRhs.malloc(offdTotal); - o_offdRhs = platform.malloc(offdTotal*sizeof(dfloat)); + o_offdRhs = platform.malloc(offdTotal); // if((rank==0)&&(settings.compareSetting("VERBOSE","TRUE"))) printf("done.\n"); } @@ -241,40 +230,36 @@ void exactSolver_t::syncToDevice() {} void exactSolver_t::Report(int lev) { - hlong hNrows = (hlong) N; - - int active = (N>0) ? 1:0; - int totalActive=0; - MPI_Allreduce(&active, &totalActive, 1, MPI_INT, MPI_SUM, comm); + int totalActive = (N>0) ? 
1:0; + comm.Allreduce(totalActive, Comm::Sum); - dlong minNrows=0, maxNrows=0; - hlong totalNrows=0; - dfloat avgNrows; - MPI_Allreduce(&N, &maxNrows, 1, MPI_DLONG, MPI_MAX, comm); - MPI_Allreduce(&hNrows, &totalNrows, 1, MPI_HLONG, MPI_SUM, comm); - avgNrows = (dfloat) totalNrows/totalActive; + dlong minNrows=N, maxNrows=N; + hlong totalNrows=N; + comm.Allreduce(maxNrows, Comm::Max); + comm.Allreduce(totalNrows, Comm::Sum); + dfloat avgNrows = (dfloat) totalNrows/totalActive; - if (N==0) N=maxNrows; //set this so it's ignored for the global min - MPI_Allreduce(&N, &minNrows, 1, MPI_DLONG, MPI_MIN, comm); + if (N==0) minNrows=maxNrows; //set this so it's ignored for the global min + comm.Allreduce(minNrows, Comm::Min); long long int nnz; - nnz = A->diag.nnz+A->offd.nnz; + nnz = A.diag.nnz+A.offd.nnz; - long long int minNnz=0, maxNnz=0, totalNnz=0; - MPI_Allreduce(&nnz, &maxNnz, 1, MPI_LONG_LONG_INT, MPI_MAX, A->comm); - MPI_Allreduce(&nnz, &totalNnz, 1, MPI_LONG_LONG_INT, MPI_SUM, A->comm); + long long int minNnz=nnz, maxNnz=nnz, totalNnz=nnz; + comm.Allreduce(maxNnz, Comm::Max); + comm.Allreduce(totalNnz, Comm::Sum); - if (nnz==0) nnz = maxNnz; //set this so it's ignored for the global min - MPI_Allreduce(&nnz, &minNnz, 1, MPI_LONG_LONG_INT, MPI_MIN, A->comm); + if (nnz==0) minNnz = maxNnz; //set this so it's ignored for the global min + comm.Allreduce(minNnz, Comm::Min); dfloat nnzPerRow = (Nrows==0) ? 
0 : (dfloat) nnz/Nrows; - dfloat minNnzPerRow=0, maxNnzPerRow=0, avgNnzPerRow=0; - MPI_Allreduce(&nnzPerRow, &maxNnzPerRow, 1, MPI_DFLOAT, MPI_MAX, A->comm); - MPI_Allreduce(&nnzPerRow, &avgNnzPerRow, 1, MPI_DFLOAT, MPI_SUM, A->comm); + dfloat minNnzPerRow=nnzPerRow, maxNnzPerRow=nnzPerRow, avgNnzPerRow=nnzPerRow; + comm.Allreduce(maxNnzPerRow, Comm::Max); + comm.Allreduce(avgNnzPerRow, Comm::Sum); avgNnzPerRow /= totalActive; - if (Nrows==0) nnzPerRow = maxNnzPerRow; - MPI_Allreduce(&nnzPerRow, &minNnzPerRow, 1, MPI_DFLOAT, MPI_MIN, A->comm); + if (Nrows==0) minNnzPerRow = maxNnzPerRow; + comm.Allreduce(minNnzPerRow, Comm::Min); std::string name = "Exact Solve "; @@ -285,13 +270,6 @@ void exactSolver_t::Report(int lev) { } } -exactSolver_t::~exactSolver_t() { - if (coarseOffsets) free(coarseOffsets); - if (coarseCounts) free(coarseCounts); - if (diagInvAT) free(diagInvAT); - if (offdInvAT) free(offdInvAT); - if (diagRhs) free(diagRhs); - if (offdRhs) free(offdRhs); -} - } //namespace parAlmond + +} //namespace libp diff --git a/libs/parAlmond/parAlmondCoarseOAS.cpp b/libs/parAlmond/parAlmondCoarseOAS.cpp index 385015bda..031e15658 100644 --- a/libs/parAlmond/parAlmondCoarseOAS.cpp +++ b/libs/parAlmond/parAlmondCoarseOAS.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -28,11 +28,13 @@ SOFTWARE. 
#include "parAlmond/parAlmondCoarseSolver.hpp" #include "parAlmond/parAlmondKernels.hpp" +namespace libp { + namespace parAlmond { -void oasSolver_t::solve(occa::memory& o_rhs, occa::memory& o_x) { +void oasSolver_t::solve(deviceMemory& o_rhs, deviceMemory& o_x) { - A->halo->ExchangeStart(o_rhs, 1, ogs_dfloat); + A.halo.ExchangeStart(o_rhs, 1); //queue local part of gemv const dfloat one=1.0; @@ -40,34 +42,33 @@ void oasSolver_t::solve(occa::memory& o_rhs, occa::memory& o_x) { if (N) dGEMVKernel(N,diagTotal,one,o_diagInvAT,o_rhs, zero, o_x); - A->halo->ExchangeFinish(o_rhs, 1, ogs_dfloat); + A.halo.ExchangeFinish(o_rhs, 1); //queue offd part of gemv if(offdTotal && N) dGEMVKernel(N,offdTotal, one, o_offdInvAT, - o_rhs+diagTotal*sizeof(dfloat), one, o_x); + o_rhs+diagTotal, one, o_x); - A->halo->Combine(o_x, 1, ogs_dfloat); + A.halo.Combine(o_x, 1); } int oasSolver_t::getTargetSize() { - MPI_Comm_size(comm, &size); - return 1000*size; + return 1000*comm.size(); } -void oasSolver_t::setup(parCSR *_A, bool nullSpace, - dfloat *nullVector, dfloat nullSpacePenalty) { +void oasSolver_t::setup(parCSR& _A, bool nullSpace, + memory nullVector, dfloat nullSpacePenalty) { A = _A; - comm = A->comm; - MPI_Comm_rank(comm, &rank); - MPI_Comm_size(comm, &size); + comm = A.comm; + rank = comm.rank(); + size = comm.size(); - N = (int) A->Ncols; - Nrows = A->Nrows; - Ncols = A->Ncols; + N = static_cast(A.Ncols); + Nrows = A.Nrows; + Ncols = A.Ncols; // if((rank==0)&&(settings.compareSetting("VERBOSE","TRUE"))) // {printf("Setting up coarse solver...");fflush(stdout);} @@ -76,25 +77,24 @@ void oasSolver_t::setup(parCSR *_A, bool nullSpace, // corresponding the offd columns //need to find where to send local rows - hlong *recvRows = (hlong *) calloc(A->Ncols-A->Nrows, sizeof(hlong)); + memory recvRows(A.Ncols-A.Nrows); - int *sendCounts = (int*) calloc(size, sizeof(int)); - int *recvCounts = (int*) calloc(size, sizeof(int)); - int *sendOffsets = (int*) calloc(size+1, sizeof(int)); - 
int *recvOffsets = (int*) calloc(size+1, sizeof(int)); + memory sendCounts(size); + memory recvCounts(size, 0); + memory sendOffsets(size+1, 0); + memory recvOffsets(size+1, 0); //use the colMap to fill the recv sizes int r=0; - for (int n=A->Nrows;nNcols;n++) { - hlong id = A->colMap[n]; - while (id>=A->globalRowStarts[r+1]) r++; //assumes the halo is sorted + for (int n=A.Nrows;n=A.globalRowStarts[r+1]) r++; //assumes the halo is sorted recvCounts[r]++; - recvRows[n-A->Nrows] = id; //record the row to recv + recvRows[n-A.Nrows] = id; //record the row to recv } //share the counts - MPI_Alltoall(recvCounts, 1, MPI_INT, - sendCounts, 1, MPI_INT, comm); + comm.Alltoall(recvCounts, sendCounts); for (r=0;r sendRows(sendTotal); //share the rowIds - MPI_Alltoallv(recvRows, recvCounts, recvOffsets, MPI_HLONG, - sendRows, sendCounts, sendOffsets, MPI_HLONG, - comm); + comm.Alltoallv(recvRows, recvCounts, recvOffsets, + sendRows, sendCounts, sendOffsets); //we now have a list of rows to send, count the nnz to send dlong nnzTotal=0; for (r=0;rglobalRowStarts[rank]); //local row id - sendCounts[r]+= A->diag.rowStarts[i+1]-A->diag.rowStarts[i]; //count entries in this row - sendCounts[r]+= A->offd.rowStarts[i+1]-A->offd.rowStarts[i]; //count entries in this row + dlong i = static_cast(sendRows[n]-A.globalRowStarts[rank]); //local row id + sendCounts[r]+= A.diag.rowStarts[i+1]-A.diag.rowStarts[i]; //count entries in this row + sendCounts[r]+= A.offd.rowStarts[i+1]-A.offd.rowStarts[i]; //count entries in this row } nnzTotal += sendCounts[r]; //tally the total } - parCOO::nonZero_t *sendNonZeros = (parCOO::nonZero_t *) calloc(nnzTotal, sizeof(parCOO::nonZero_t)); + memory sendNonZeros(nnzTotal); nnzTotal=0; //reset for (r=0;rglobalRowStarts[rank]); //local row id - for (dlong jj=A->diag.rowStarts[i]; jjdiag.rowStarts[i+1];jj++){ + dlong i = static_cast(sendRows[n] - A.globalRowStarts[rank]); //local row id + for (dlong jj=A.diag.rowStarts[i]; jjdiag.cols[jj] + 
A->globalRowStarts[rank]; - sendNonZeros[nnzTotal].val = A->diag.vals[jj]; + sendNonZeros[nnzTotal].col = A.diag.cols[jj] + A.globalRowStarts[rank]; + sendNonZeros[nnzTotal].val = A.diag.vals[jj]; nnzTotal++; } - for (dlong jj=A->offd.rowStarts[i]; jjoffd.rowStarts[i+1];jj++){ + for (dlong jj=A.offd.rowStarts[i]; jjcolMap[A->offd.cols[jj]]; - sendNonZeros[nnzTotal].val = A->offd.vals[jj]; + sendNonZeros[nnzTotal].col = A.colMap[A.offd.cols[jj]]; + sendNonZeros[nnzTotal].val = A.offd.vals[jj]; nnzTotal++; } } } - MPI_Alltoall(sendCounts, 1, MPI_INT, - recvCounts, 1, MPI_INT, comm); + comm.Alltoall(sendCounts, recvCounts); for (r=0;r recvNonZeros(nnzTotal); - //clean up - MPI_Barrier(comm); - free(sendNonZeros); - free(sendCounts); - free(recvCounts); - free(sendOffsets); - free(recvOffsets); + comm.Alltoallv(sendNonZeros, sendCounts, sendOffsets, + recvNonZeros, recvCounts, recvOffsets); //we now have all the nonlocal rows (should also be sorted) //first re-index the column indices - dlong id=A->Nrows; + dlong id=A.Nrows; for (dlong n=0;ncolMap[id]!=row) id++; //shift along list of recieved columns + while(A.colMap[id]!=row) id++; //shift along list of recieved columns recvNonZeros[n].row = id; //overwrite with new local row id //now check the column index hlong col = recvNonZeros[n].col; - if (col >= A->globalRowStarts[rank] && col < A->globalRowStarts[rank+1]) {//local column - recvNonZeros[n].col = col - A->globalRowStarts[rank];//overwrite with local col id + if (col >= A.globalRowStarts[rank] && col < A.globalRowStarts[rank+1]) {//local column + recvNonZeros[n].col = col - A.globalRowStarts[rank];//overwrite with local col id } else { int flag = 0; - for (dlong jj=A->Nrows;jjNcols;jj++) { //look for the right id in the halo - if (A->colMap[jj]==col) { + for (dlong jj=A.Nrows;jjNrows;n++) { - const int start = (int) A->diag.rowStarts[n]; - const int end = (int) A->diag.rowStarts[n+1]; + memory coarseA(N*N); + for (int n=0;n(A.diag.rowStarts[n]); + const int end 
= static_cast(A.diag.rowStarts[n+1]); for (int m=start;mdiag.cols[m]; - coarseA[n*N+col] = A->diag.vals[m]; + int col = static_cast(A.diag.cols[m]); + coarseA[n*N+col] = A.diag.vals[m]; } } - for (int n=0;noffd.nzRows;n++) { - const int row = (int) A->offd.rows[n]; - const int start = (int) A->offd.mRowStarts[n]; - const int end = (int) A->offd.mRowStarts[n+1]; + for (int n=0;n(A.offd.rows[n]); + const int start = static_cast(A.offd.mRowStarts[n]); + const int end = static_cast(A.offd.mRowStarts[n+1]); for (int m=start;moffd.cols[m]; - coarseA[row*N+col] = A->offd.vals[m]; + int col = static_cast(A.offd.cols[m]); + coarseA[row*N+col] = A.offd.vals[m]; } } @@ -224,43 +213,36 @@ void oasSolver_t::setup(parCSR *_A, bool nullSpace, if (nullSpace) { //A is dense due to nullspace augmentation //copy fine nullvector and populate halo - dfloat *null = (dfloat *) malloc(A->Ncols*sizeof(dfloat)); - memcpy(null, nullVector, A->Nrows*sizeof(dfloat)); - A->halo->Exchange(null, 1, ogs_dfloat); + memory null(A.Ncols); + null.copyFrom(nullVector, A.Nrows); + A.halo.Exchange(null, 1); for (int n=0;n weight(N, 1.0); - A->halo->Combine(weight, 1, ogs_dfloat); + A.halo.Combine(weight, 1); for (int n=0;nNrows; - offdTotal = A->Ncols - A->Nrows; + diagTotal = A.Nrows; + offdTotal = A.Ncols - A.Nrows; //diag piece of invA - diagInvAT = (dfloat *) calloc(N*diagTotal,sizeof(dfloat)); + diagInvAT.malloc(N*diagTotal); for (int n=0;n(diagInvAT); + o_offdInvAT = platform.malloc(offdInvAT); // if((rank==0)&&(settings.compareSetting("VERBOSE","TRUE"))) printf("done.\n"); } @@ -285,40 +267,36 @@ void oasSolver_t::syncToDevice() {} void oasSolver_t::Report(int lev) { - hlong hNrows = (hlong) N; + int totalActive = (N>0) ? 1:0; + comm.Allreduce(totalActive, Comm::Sum); - int active = (N>0) ? 
1:0; - int totalActive=0; - MPI_Allreduce(&active, &totalActive, 1, MPI_INT, MPI_SUM, comm); + dlong minNrows=N, maxNrows=N; + hlong totalNrows=N; + comm.Allreduce(maxNrows, Comm::Max); + comm.Allreduce(totalNrows, Comm::Sum); + dfloat avgNrows = static_cast(totalNrows)/totalActive; - dlong minNrows=0, maxNrows=0; - hlong totalNrows=0; - dfloat avgNrows; - MPI_Allreduce(&N, &maxNrows, 1, MPI_DLONG, MPI_MAX, comm); - MPI_Allreduce(&hNrows, &totalNrows, 1, MPI_HLONG, MPI_SUM, comm); - avgNrows = (dfloat) totalNrows/totalActive; - - if (N==0) N=maxNrows; //set this so it's ignored for the global min - MPI_Allreduce(&N, &minNrows, 1, MPI_DLONG, MPI_MIN, comm); + if (N==0) minNrows=maxNrows; //set this so it's ignored for the global min + comm.Allreduce(minNrows, Comm::Min); long long int nnz; - nnz = A->diag.nnz+A->offd.nnz; + nnz = A.diag.nnz+A.offd.nnz; - long long int minNnz=0, maxNnz=0, totalNnz=0; - MPI_Allreduce(&nnz, &maxNnz, 1, MPI_LONG_LONG_INT, MPI_MAX, comm); - MPI_Allreduce(&nnz, &totalNnz, 1, MPI_LONG_LONG_INT, MPI_SUM, comm); + long long int minNnz=nnz, maxNnz=nnz, totalNnz=nnz; + comm.Allreduce(maxNnz, Comm::Max); + comm.Allreduce(totalNnz, Comm::Sum); - if (nnz==0) nnz = maxNnz; //set this so it's ignored for the global min - MPI_Allreduce(&nnz, &minNnz, 1, MPI_LONG_LONG_INT, MPI_MIN, comm); + if (nnz==0) minNnz = maxNnz; //set this so it's ignored for the global min + comm.Allreduce(minNnz, Comm::Min); - dfloat nnzPerRow = (Nrows==0) ? 0 : (dfloat) nnz/Nrows; - dfloat minNnzPerRow=0, maxNnzPerRow=0, avgNnzPerRow=0; - MPI_Allreduce(&nnzPerRow, &maxNnzPerRow, 1, MPI_DFLOAT, MPI_MAX, comm); - MPI_Allreduce(&nnzPerRow, &avgNnzPerRow, 1, MPI_DFLOAT, MPI_SUM, comm); + dfloat nnzPerRow = (Nrows==0) ? 
0 : static_cast(nnz)/Nrows; + dfloat minNnzPerRow=nnzPerRow, maxNnzPerRow=nnzPerRow, avgNnzPerRow=nnzPerRow; + comm.Allreduce(maxNnzPerRow, Comm::Max); + comm.Allreduce(avgNnzPerRow, Comm::Sum); avgNnzPerRow /= totalActive; - if (Nrows==0) nnzPerRow = maxNnzPerRow; - MPI_Allreduce(&nnzPerRow, &minNnzPerRow, 1, MPI_DFLOAT, MPI_MIN, comm); + if (Nrows==0) minNnzPerRow = maxNnzPerRow; + comm.Allreduce(minNnzPerRow, Comm::Min); std::string name = "OAS "; @@ -329,9 +307,6 @@ void oasSolver_t::Report(int lev) { } } -oasSolver_t::~oasSolver_t() { - if (diagInvAT) free(diagInvAT); - if (offdInvAT) free(offdInvAT); -} - } //namespace parAlmond + +} //namespace libp diff --git a/libs/parAlmond/parAlmondCoarsenLevel.cpp b/libs/parAlmond/parAlmondCoarsenLevel.cpp index 3ed7c06f2..97120f2a7 100644 --- a/libs/parAlmond/parAlmondCoarsenLevel.cpp +++ b/libs/parAlmond/parAlmondCoarsenLevel.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,62 +27,61 @@ SOFTWARE. 
#include "parAlmond.hpp" #include "parAlmond/parAlmondAMGSetup.hpp" +namespace libp { + namespace parAlmond { //create coarsened problem -amgLevel *coarsenAmgLevel(amgLevel *level, dfloat *null, - StrengthType strtype, dfloat theta, - AggType aggtype){ +amgLevel coarsenAmgLevel(amgLevel& level, memory& null, + StrengthType strtype, dfloat theta, + AggType aggtype){ + + parCSR& A = level.A; - int size; - MPI_Comm_size(level->A->comm, &size); + int size = A.comm.size(); - strongGraph_t *C = strongGraph(level->A, strtype, theta); + strongGraph_t C = strongGraph(A, strtype, theta); - hlong *FineToCoarse = (hlong *) malloc(level->A->Ncols*sizeof(hlong)); - hlong *globalAggStarts = (hlong *) calloc(size+1,sizeof(hlong)); + memory FineToCoarse(A.Ncols); + memory globalAggStarts(size+1); - formAggregates(level->A, C, FineToCoarse, globalAggStarts); - delete C; + formAggregates(A, C, FineToCoarse, globalAggStarts); // adjustPartition(FineToCoarse, settings); - parCSR *P; - parCSR *T = tentativeProlongator(level->A, FineToCoarse, globalAggStarts, null); + parCSR P; + parCSR T = tentativeProlongator(A, FineToCoarse, globalAggStarts, null); if (aggtype == SMOOTHED) { - P = smoothProlongator(level->A, T); - delete T; + P = smoothProlongator(A, T); } else { P = T; } // R = P^T - parCSR *R = transpose(P); + parCSR R = transpose(P); - level->P = P; - level->R = R; + level.P = P; + level.R = R; - parCSR *Acoarse; + parCSR Acoarse; if (aggtype == SMOOTHED) { - parCSR *AP = SpMM(level->A, P); + parCSR AP = SpMM(A, P); Acoarse = SpMM(R, AP); - delete AP; } else { - Acoarse = galerkinProd(level->A, P); //specialize for unsmoothed aggregation + Acoarse = galerkinProd(A, P); //specialize for unsmoothed aggregation } - Acoarse->diagSetup(); + Acoarse.diagSetup(); - amgLevel *coarseLevel = new amgLevel(Acoarse,level->settings); + amgLevel coarseLevel(Acoarse,level.settings); //update the number of columns required for this level - level->Ncols = (level->Ncols > R->Ncols) ? 
level->Ncols : R->Ncols; + level.Ncols = std::max(level.Ncols, std::max(A.Ncols, R.Ncols)); // coarseLevel->Ncols = (coarseLevel->Ncols > P->Ncols) ? coarseLevel->Ncols : P->Ncols; - free(FineToCoarse); - free(globalAggStarts); - return coarseLevel; } } //namespace parAlmond + +} //namespace libp diff --git a/libs/parAlmond/parAlmondFormAggregates.cpp b/libs/parAlmond/parAlmondFormAggregates.cpp index b42ec5d95..0fea3d499 100644 --- a/libs/parAlmond/parAlmondFormAggregates.cpp +++ b/libs/parAlmond/parAlmondFormAggregates.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,9 +27,12 @@ SOFTWARE. #include "parAlmond.hpp" #include "parAlmond/parAlmondAMGSetup.hpp" +namespace libp { + namespace parAlmond { -static bool customLess(int smax, dfloat rmax, hlong imax, int s, dfloat r, hlong i){ +static bool customLess(const int smax, const dfloat rmax, const hlong imax, + const int s, const dfloat r, const hlong i){ if(s > smax) return true; if(smax > s) return false; @@ -49,46 +52,43 @@ static bool customLess(int smax, dfloat rmax, hlong imax, int s, dfloat r, hlong // /*****************************************************************************/ -void formAggregates(parCSR *A, strongGraph_t *C, - hlong* FineToCoarse, - hlong* globalAggStarts){ +void formAggregates(parCSR& A, strongGraph_t& C, + memory FineToCoarse, + memory globalAggStarts){ - int rank, size; - MPI_Comm_rank(A->comm, &rank); - MPI_Comm_size(A->comm, &size); + int rank = A.comm.rank(); + int size = A.comm.size(); - const dlong N = C->Nrows; - const dlong M = C->Ncols; - const dlong nnz = C->nnz; + const dlong N = C.Nrows; + const dlong M = C.Ncols; + const dlong nnz = C.nnz; 
- dfloat *rands = (dfloat *) calloc(M, sizeof(dfloat)); - int *states = (int *) calloc(M, sizeof(int)); - hlong *colMap = A->colMap; //mapping from local column ids to global ids + memory rands(M); + memory states(M, 0); + memory colMap = A.colMap; //mapping from local column ids to global ids - dfloat *Tr = (dfloat *) calloc(M, sizeof(dfloat)); - int *Ts = (int *) calloc(M, sizeof(int)); - hlong *Ti = (hlong *) calloc(M, sizeof(hlong)); - hlong *Tc = (hlong *) calloc(M, sizeof(hlong)); + memory Tr(M); + memory Ts(M); + memory Ti(M); + memory Tc(M); for(dlong i=0; i colCnt(M, 0); for(dlong i=0; icols[i]]++; + colCnt[C.cols[i]]++; //gs for total column counts - A->halo->Combine(colCnt, 1, ogs_int); + A.halo.Combine(colCnt, 1); //add random pertubation for(int i=0;ihalo->Exchange(rands, 1, ogs_dfloat); + A.halo.Exchange(rands, 1); hlong done = 0; while(!done){ @@ -100,8 +100,8 @@ void formAggregates(parCSR *A, strongGraph_t *C, hlong imax = colMap[i]; if(smax != 1){ - for(dlong jj=C->rowStarts[i];jjrowStarts[i+1];jj++){ - const dlong col = C->cols[jj]; + for(dlong jj=C.rowStarts[i];jjhalo->Exchange(Tr, 1, ogs_dfloat); - A->halo->Exchange(Ts, 1, ogs_int); - A->halo->Exchange(Ti, 1, ogs_hlong); + A.halo.Exchange(Tr, 1); + A.halo.Exchange(Ts, 1); + A.halo.Exchange(Ti, 1); // second neighbours // #pragma omp parallel for @@ -127,8 +127,8 @@ void formAggregates(parCSR *A, strongGraph_t *C, dfloat rmax = Tr[i]; hlong imax = Ti[i]; - for(dlong jj=C->rowStarts[i];jjrowStarts[i+1];jj++){ - const dlong col = C->cols[jj]; + for(dlong jj=C.rowStarts[i];jjhalo->Exchange(states, 1, ogs_int); + A.halo.Exchange(states, 1); // if number of undecided nodes = 0, algorithm terminates - hlong cnt = 0; - for (dlong n=0;ncomm); + A.comm.Allreduce(done, Comm::Sum); done = (done == 0) ? 
1 : 0; } dlong numAggs = 0; - dlong *gNumAggs = (dlong *) calloc(size,sizeof(dlong)); + memory gNumAggs(size); // count the coarse nodes/aggregates for(dlong i=0; icomm); + A.comm.Allgather(numAggs, gNumAggs); globalAggStarts[0] = 0; for (int r=0;rhalo->Exchange(FineToCoarse, 1, ogs_hlong); + A.halo.Exchange(FineToCoarse, 1); // form the aggregates // #pragma omp parallel for @@ -195,8 +192,8 @@ void formAggregates(parCSR *A, strongGraph_t *C, hlong cmax = FineToCoarse[i]; if(smax != 1){ - for(dlong jj=C->rowStarts[i];jjrowStarts[i+1];jj++){ - const dlong col = C->cols[jj]; + for(dlong jj=C.rowStarts[i];jjhalo->Exchange(FineToCoarse, 1, ogs_hlong); - A->halo->Exchange(Tr, 1, ogs_dfloat); - A->halo->Exchange(Ts, 1, ogs_int); - A->halo->Exchange(Ti, 1, ogs_hlong); - A->halo->Exchange(Tc, 1, ogs_hlong); + A.halo.Exchange(FineToCoarse, 1); + A.halo.Exchange(Tr, 1); + A.halo.Exchange(Ts, 1); + A.halo.Exchange(Ti, 1); + A.halo.Exchange(Tc, 1); // second neighbours // #pragma omp parallel for @@ -230,8 +227,8 @@ void formAggregates(parCSR *A, strongGraph_t *C, hlong imax = Ti[i]; hlong cmax = Tc[i]; - for(dlong jj=C->rowStarts[i];jjrowStarts[i+1];jj++){ - const dlong col = C->cols[jj]; + for(dlong jj=C.rowStarts[i];jjhalo->Exchange(FineToCoarse, 1, ogs_hlong); - - free(rands); - free(states); - free(Tr); - free(Ts); - free(Ti); - free(Tc); + A.halo.Exchange(FineToCoarse, 1); } } //namespace parAlmond + +} //namespace libp diff --git a/libs/parAlmond/parAlmondGalerkinProd.cpp b/libs/parAlmond/parAlmondGalerkinProd.cpp index 03429d38b..2d21207ed 100644 --- a/libs/parAlmond/parAlmondGalerkinProd.cpp +++ b/libs/parAlmond/parAlmondGalerkinProd.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 
documentation files (the "Software"), to deal @@ -27,16 +27,17 @@ SOFTWARE. #include "parAlmond.hpp" #include "parAlmond/parAlmondAMGSetup.hpp" +namespace libp { + namespace parAlmond { -parCSR *galerkinProd(parCSR *A, parCSR *P){ +parCSR galerkinProd(parCSR& A, parCSR& P){ // MPI info - int rank, size; - MPI_Comm_rank(A->comm, &rank); - MPI_Comm_size(A->comm, &size); + int rank = A.comm.rank(); + int size = A.comm.size(); - hlong *globalAggStarts = P->globalColStarts; + memory globalAggStarts = P.globalColStarts; hlong globalAggOffset = globalAggStarts[rank]; //The galerkin product can be computed as @@ -44,47 +45,47 @@ parCSR *galerkinProd(parCSR *A, parCSR *P){ // Since each row of P has only one entry, we can share the necessary // P entries, form the products, and send them to their destination rank - const dlong N = A->Nrows; - const dlong M = A->Ncols; + const dlong N = A.Nrows; + const dlong M = A.Ncols; //printf("Level has %d rows, and is making %d aggregates\n", N, globalAggStarts[rank+1]-globalAggStarts[rank]); // Exploit the fact that we know P has one non-zero per row to // compress the global Ids of the columns and nonzero values to // single vectors - hlong *Pcols = (hlong *) calloc(M,sizeof(hlong)); - pfloat *Pvals = (pfloat *) calloc(M,sizeof(pfloat)); + memory Pcols(M); + memory Pvals(M); //record the entries of P that this rank has for (dlong i=0;idiag.rowStarts[i];jdiag.rowStarts[i+1];j++) { - Pcols[i] = P->diag.cols[j] + globalAggOffset; //global ID - Pvals[i] = P->diag.vals[j]; + for (dlong j=P.diag.rowStarts[i];joffd.nzRows;i++) { - const dlong row = P->offd.rows[i]; - for (dlong j=P->offd.mRowStarts[i];joffd.mRowStarts[i+1];j++) { - Pcols[row] = P->colMap[P->offd.cols[j]]; //global ID - Pvals[row] = P->offd.vals[j]; + for (dlong i=0;ihalo->Exchange(Pcols, 1, ogs_hlong); - A->halo->Exchange(Pvals, 1, ogs_pfloat); + A.halo.Exchange(Pcols, 1); + A.halo.Exchange(Pvals, 1); - dlong sendNtotal = A->diag.nnz+A->offd.nnz; - parCOO::nonZero_t 
*sendPTAP = (parCOO::nonZero_t *) calloc(sendNtotal,sizeof(parCOO::nonZero_t)); + dlong sendNtotal = A.diag.nnz+A.offd.nnz; + memory sendPTAP(sendNtotal); //form the fine PTAP products dlong cnt =0; for (dlong i=0;idiag.rowStarts[i]; - const dlong end = A->diag.rowStarts[i+1]; + const dlong start = A.diag.rowStarts[i]; + const dlong end = A.diag.rowStarts[i+1]; for (dlong j=start;jdiag.cols[j]; - const dfloat val = A->diag.vals[j]; + const dlong col = A.diag.cols[j]; + const dfloat val = A.diag.vals[j]; sendPTAP[cnt].row = Pcols[i]; sendPTAP[cnt].col = Pcols[col]; @@ -92,13 +93,13 @@ parCSR *galerkinProd(parCSR *A, parCSR *P){ cnt++; } } - for (dlong i=0;ioffd.nzRows;i++) { - const dlong row = A->offd.rows[i]; - const dlong start = A->offd.mRowStarts[i]; - const dlong end = A->offd.mRowStarts[i+1]; + for (dlong i=0;ioffd.cols[j]; - const dfloat val = A->offd.vals[j]; + const dlong col = A.offd.cols[j]; + const dfloat val = A.offd.vals[j]; sendPTAP[cnt].row = Pcols[row]; sendPTAP[cnt].col = Pcols[col]; @@ -107,11 +108,8 @@ parCSR *galerkinProd(parCSR *A, parCSR *P){ } } - free(Pcols); - free(Pvals); - //sort entries by the coarse row and col - std::sort(sendPTAP, sendPTAP+sendNtotal, + std::sort(sendPTAP.ptr(), sendPTAP.ptr()+sendNtotal, [](const parCOO::nonZero_t& a, const parCOO::nonZero_t& b) { if (a.row < b.row) return true; if (a.row > b.row) return false; @@ -120,10 +118,10 @@ parCSR *galerkinProd(parCSR *A, parCSR *P){ }); //count number of non-zeros we're sending - int *sendCounts = (int *) calloc(size,sizeof(int)); - int *recvCounts = (int *) calloc(size,sizeof(int)); - int *sendOffsets = (int *) calloc(size+1,sizeof(int)); - int *recvOffsets = (int *) calloc(size+1,sizeof(int)); + memory sendCounts(size,0); + memory recvCounts(size); + memory sendOffsets(size+1); + memory recvOffsets(size+1); int r=0; for(dlong i=0;icomm); + A.comm.Alltoall(sendCounts, recvCounts); // find send and recv offsets for gather + sendOffsets[0] = 0; + recvOffsets[0] = 0; for(int 
rr=0;rrcomm); + memory recvPTAP(recvNtotal); - //clean up - MPI_Barrier(A->comm); - free(sendPTAP); - free(sendCounts); free(recvCounts); - free(sendOffsets); free(recvOffsets); + A.comm.Alltoallv(sendPTAP, sendCounts, sendOffsets, + recvPTAP, recvCounts, recvOffsets); //sort entries by the coarse row and col - std::sort(recvPTAP, recvPTAP+recvNtotal, + std::sort(recvPTAP.ptr(), recvPTAP.ptr()+recvNtotal, [](const parCOO::nonZero_t& a, const parCOO::nonZero_t& b) { if (a.row < b.row) return true; if (a.row > b.row) return false; @@ -172,16 +164,14 @@ parCSR *galerkinProd(parCSR *A, parCSR *P){ (recvPTAP[i].col!=recvPTAP[i-1].col)) nnz++; - parCOO PTAP(A->platform, A->comm); + parCOO PTAP(A.platform, A.comm); //copy global partition - PTAP.globalRowStarts = (hlong *) calloc(size+1,sizeof(hlong)); - PTAP.globalColStarts = (hlong *) calloc(size+1,sizeof(hlong)); - memcpy(PTAP.globalRowStarts, globalAggStarts, (size+1)*sizeof(hlong)); - memcpy(PTAP.globalColStarts, globalAggStarts, (size+1)*sizeof(hlong)); + PTAP.globalRowStarts = globalAggStarts; + PTAP.globalColStarts = globalAggStarts; PTAP.nnz = nnz; - PTAP.entries = (parCOO::nonZero_t *) malloc(PTAP.nnz*sizeof(parCOO::nonZero_t)); + PTAP.entries.malloc(PTAP.nnz); //compress nonzeros nnz = 0; @@ -195,12 +185,10 @@ parCSR *galerkinProd(parCSR *A, parCSR *P){ } } - //clean up - MPI_Barrier(A->comm); - free(recvPTAP); - //build Ac from coo matrix - return new parCSR(PTAP); + return parCSR(PTAP); } } //namespace parAlmond + +} //namespace libp diff --git a/libs/parAlmond/parAlmondKcycle.cpp b/libs/parAlmond/parAlmondKcycle.cpp index 4b5bf5d6f..4f5eb4d20 100644 --- a/libs/parAlmond/parAlmondKcycle.cpp +++ b/libs/parAlmond/parAlmondKcycle.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person 
obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,11 +26,13 @@ SOFTWARE. #include "parAlmond.hpp" #include "parAlmond/parAlmondKernels.hpp" -#include "parAlmond/parAlmondMultigrid.hpp" +#include "parAlmond/parAlmondCoarseSolver.hpp" + +namespace libp { namespace parAlmond { -void multigrid_t::kcycle(const int k, occa::memory& o_RHS, occa::memory& o_X){ +void multigrid_t::kcycle(const int k, deviceMemory& o_RHS, deviceMemory& o_X){ //check for base level if(k==baseLevel) { @@ -38,21 +40,21 @@ void multigrid_t::kcycle(const int k, occa::memory& o_RHS, occa::memory& o_X){ return; } - multigridLevel *level = levels[k]; - multigridLevel *levelC = levels[k+1]; - occa::memory& o_RHSC = o_rhs[k+1]; - occa::memory& o_XC = o_x[k+1]; - occa::memory& o_RES = o_scratch; + multigridLevel& level = *levels[k]; + multigridLevel& levelC = *levels[k+1]; + deviceMemory& o_RHSC = o_rhs[k+1]; + deviceMemory& o_XC = o_x[k+1]; + deviceMemory& o_RES = o_scratch; - const dlong mCoarse = levelC->Nrows; + const dlong mCoarse = levelC.Nrows; //apply smoother to x and then compute res = rhs-Ax - level->smooth(o_RHS, o_X, true); + level.smooth(o_RHS, o_X, true); - level->residual(o_RHS, o_X, o_RES); + level.residual(o_RHS, o_X, o_RES); // rhsC = P^T res - level->coarsen(o_RES, o_RHSC); + level.coarsen(o_RES, o_RHSC); if(k+1>NUMKCYCLES) { vcycle(k+1, o_RHSC, o_XC); @@ -60,9 +62,9 @@ void multigrid_t::kcycle(const int k, occa::memory& o_RHS, occa::memory& o_X){ // first inner krylov iteration kcycle(k+1, o_RHSC, o_XC); - occa::memory& o_CK = o_ck[k+1]; - occa::memory& o_VK = o_vk[k+1]; - occa::memory& o_WK = o_wk[k+1]; + deviceMemory& o_CK = o_ck[k+1]; + deviceMemory& o_VK = o_vk[k+1]; + deviceMemory& o_WK = o_wk[k+1]; // ck = xC, vk = A*ck // alpha1=ck*rhsC, rho1=ck*Ack, norm_rhs=sqrt(rhsC*rhsC) @@ -70,11 +72,11 @@ void multigrid_t::kcycle(const int k, occa::memory& o_RHS, occa::memory& o_X){ // norm_rtilde = sqrt(rhsC*rhsC) dfloat rho1, alpha1, 
norm_rhs, norm_rhstilde; kcycleOp1(levelC, o_XC, o_RHSC, o_CK, o_VK, - &alpha1, &rho1, &norm_rhs, &norm_rhstilde); + alpha1, rho1, norm_rhs, norm_rhstilde); if(norm_rhstilde < KCYCLETOL*norm_rhs){ // xC = (alpha1/rho1)*xC - platform.linAlg.scale(mCoarse, alpha1/rho1, o_XC); + platform.linAlg().scale(mCoarse, alpha1/rho1, o_XC); } else{ // second inner krylov iteration @@ -89,135 +91,159 @@ void multigrid_t::kcycle(const int k, occa::memory& o_RHS, occa::memory& o_X){ } // x = x + P xC - level->prolongate(o_XC, o_X); + level.prolongate(o_XC, o_X); - level->smooth(o_RHS, o_X, false); + level.smooth(o_RHS, o_X, false); } -void multigrid_t::kcycleOp1(multigridLevel* level, - occa::memory& o_X, occa::memory& o_RHS, - occa::memory& o_CK, occa::memory& o_VK, - dfloat *alpha1, dfloat *rho1, - dfloat *norm_rhs, dfloat *norm_rhstilde) { +void multigrid_t::kcycleOp1(multigridLevel& level, + deviceMemory& o_X, deviceMemory& o_RHS, + deviceMemory& o_CK, deviceMemory& o_VK, + dfloat& alpha1, dfloat& rho1, + dfloat& norm_rhs, dfloat& norm_rhstilde) { //ck = x - platform.linAlg.axpy(level->Nrows, 1.0, o_X, 0.0, o_CK); + platform.linAlg().axpy(level.Nrows, 1.0, o_X, 0.0, o_CK); // vk = A*ck - level->Operator(o_CK,o_VK); + level.Operator(o_CK,o_VK); // alpha1=ck*rhsC, rho1=ck*Ack, norm_rhs=sqrt(rhsC*rhsC) - dfloat rho[3]; if(ktype == PCG) - kcycleCombinedOp1(level, rho, o_CK, o_RHS, o_VK); + kcycleCombinedOp1(level, o_CK, o_RHS, o_VK, alpha1, rho1, norm_rhs); if(ktype == GMRES) - kcycleCombinedOp1(level, rho, o_VK, o_RHS, o_VK); + kcycleCombinedOp1(level, o_VK, o_RHS, o_VK, alpha1, rho1, norm_rhs); - *alpha1 = rho[0]; - *rho1 = rho[1]; - *norm_rhs = sqrt(rho[2]); + norm_rhs = sqrt(norm_rhs); // rhs = rhs - (alpha1/rho1)*vk - const dfloat a = -(*alpha1)/(*rho1); - *norm_rhstilde = sqrt(vectorAddInnerProd(level, a, o_VK, 1.0, o_RHS)); + const dfloat a = -(alpha1)/(rho1); + norm_rhstilde = sqrt(vectorAddInnerProd(level, a, o_VK, 1.0, o_RHS)); } -void 
multigrid_t::kcycleOp2(multigridLevel* level, - occa::memory& o_X, occa::memory& o_RHS, - occa::memory& o_CK, occa::memory& o_VK, occa::memory& o_WK, +void multigrid_t::kcycleOp2(multigridLevel& level, + deviceMemory& o_X, deviceMemory& o_RHS, + deviceMemory& o_CK, deviceMemory& o_VK, deviceMemory& o_WK, const dfloat alpha1, const dfloat rho1) { - if(fabs(rho1) > (dfloat) 1e-20){ + if(std::abs(rho1) > (dfloat) 1e-20){ // wk = A*x - level->Operator(o_X,o_WK); + level.Operator(o_X,o_WK); // gamma=xC*Ack, beta=xC*AxC, alpha2=xC*rhsC - dfloat rho[3]; + dfloat gamma, beta, alpha2; if(ktype == PCG) - kcycleCombinedOp2(level, rho, o_X, o_VK, o_WK, o_RHS); + kcycleCombinedOp2(level, o_X, o_VK, o_WK, o_RHS, gamma, beta, alpha2); if(ktype == GMRES) - kcycleCombinedOp2(level, rho, o_WK, o_VK, o_WK, o_RHS); - - const dfloat gamma = rho[0]; - const dfloat beta = rho[1]; - const dfloat alpha2 = rho[2]; - + kcycleCombinedOp2(level, o_WK, o_VK, o_WK, o_RHS, gamma, beta, alpha2); const dfloat rho2 = beta - gamma*gamma/rho1; - if(fabs(rho2) > (dfloat) 1e-20){ + if(std::abs(rho2) > (dfloat) 1e-20){ // x = (alpha1/rho1 - (gam*alpha2)/(rho1*rho2))*ck + (alpha2/rho2)*x const dfloat a = alpha1/rho1 - gamma*alpha2/(rho1*rho2); const dfloat b = alpha2/rho2; - platform.linAlg.axpy(level->Nrows, a, o_CK, b, o_X); + platform.linAlg().axpy(level.Nrows, a, o_CK, b, o_X); } } } // returns aDotbc[0] = a\dot b, aDotbc[1] = a\dot c, aDotbc[2] = b\dot b, -void multigrid_t::kcycleCombinedOp1(multigridLevel* level, - dfloat *aDotbc, occa::memory& o_a, - occa::memory& o_b, occa::memory& o_c) { +void multigrid_t::kcycleCombinedOp1(multigridLevel& level, + deviceMemory& o_a, + deviceMemory& o_b, + deviceMemory& o_c, + dfloat& aDotb, + dfloat& aDotc, + dfloat& bDotb) { - const dlong N = level->Nrows; - dfloat result[3] = {0.,0.,0.}; - dlong numBlocks = (N < PARALMOND_NBLOCKS) ? 
N : PARALMOND_NBLOCKS; + const dlong N = level.Nrows; + dlong numBlocks = std::min(N, PARALMOND_NBLOCKS); kcycleCombinedOp1Kernel(numBlocks,N,o_a,o_b,o_c,o_reductionScratch); - o_reductionScratch.copyTo(reductionScratch,3*numBlocks*sizeof(dfloat),0); - for(dlong i=0; i0) { + reductionScratch.copyFrom(o_reductionScratch,3*numBlocks); + } else { + reductionScratch[0] = 0.0; + reductionScratch[1] = 0.0; + reductionScratch[2] = 0.0; } - MPI_Allreduce(result,aDotbc,3,MPI_DFLOAT,MPI_SUM,comm); + + for(dlong i=1; iNrows; - dfloat result[3] = {0.,0.,0.}; - dlong numBlocks = (N < PARALMOND_NBLOCKS) ? N : PARALMOND_NBLOCKS; +void multigrid_t::kcycleCombinedOp2(multigridLevel& level, + deviceMemory& o_a, + deviceMemory& o_b, + deviceMemory& o_c, + deviceMemory& o_d, + dfloat& aDotb, + dfloat& aDotc, + dfloat& aDotd) { + + const dlong N = level.Nrows; + dlong numBlocks = std::min(N, PARALMOND_NBLOCKS); kcycleCombinedOp2Kernel(numBlocks,N,o_a,o_b,o_c,o_d,o_reductionScratch); - o_reductionScratch.copyTo(reductionScratch,3*numBlocks*sizeof(dfloat),0); - for(dlong i=0; i0) { + reductionScratch.copyFrom(o_reductionScratch,3*numBlocks); + } else { + reductionScratch[0] = 0.0; + reductionScratch[1] = 0.0; + reductionScratch[2] = 0.0; } - MPI_Allreduce(result,aDotbcd,3,MPI_DFLOAT,MPI_SUM,comm); + + for(dlong i=1; i& o_X, + const dfloat beta, deviceMemory& o_Y){ - const dlong N = level->Nrows; - dfloat result = 0.; - dfloat gresult = 0.; - dlong numBlocks = (N < PARALMOND_NBLOCKS) ? 
N : PARALMOND_NBLOCKS; + const dlong N = level.Nrows; + dlong numBlocks = std::min(N, PARALMOND_NBLOCKS); vectorAddInnerProdKernel(numBlocks,N,alpha,beta,o_X,o_Y,o_reductionScratch); - o_reductionScratch.copyTo(reductionScratch,numBlocks*sizeof(dfloat),0); + + if (numBlocks>0) { + reductionScratch.copyFrom(o_reductionScratch,numBlocks); + } else { + reductionScratch[0] = 0.0; + } // #pragma omp parallel for reduction(+:result) - for (dlong i=0; i& o_RHS, deviceMemory& o_X) { if (ctype == KCYCLE) { kcycle(0, o_RHS, o_X); } else { @@ -40,7 +41,7 @@ void multigrid_t::Operator(occa::memory& o_RHS, occa::memory& o_X) { } multigrid_t::multigrid_t(platform_t& _platform, settings_t& _settings, - MPI_Comm _comm): + comm_t _comm): platform(_platform), settings(_settings), comm(_comm) { //determine what sort of multigrid cycle to construct @@ -80,70 +81,64 @@ multigrid_t::multigrid_t(platform_t& _platform, settings_t& _settings, coarsetype=COARSEEXACT; if (coarsetype==COARSEEXACT) { - coarseSolver = new exactSolver_t(_platform, _settings, _comm); + coarseSolver = std::make_shared(_platform, _settings, _comm); } else { - coarseSolver = new oasSolver_t(_platform, _settings, _comm); + coarseSolver = std::make_shared(_platform, _settings, _comm); } } -multigrid_t::~multigrid_t() { - if (linearSolver) delete linearSolver; - if (coarseSolver) delete coarseSolver; - for (int n=0;nNrows, level->Ncols - level->Nrows, - platform, settings, comm); + linearSolver.Setup(level.Nrows, + level.Ncols - level.Nrows, + platform, settings, comm); else - linearSolver = new pcg(level->Nrows, level->Ncols - level->Nrows, - platform, settings, comm); + linearSolver.Setup(level.Nrows, + level.Ncols - level.Nrows, + platform, settings, comm); } if (ctype==KCYCLE) { //first level - if (reductionScratchBytes==0) { - reductionScratchBytes = 3*PARALMOND_NBLOCKS*sizeof(dfloat); - dfloat *dummy = (dfloat *) calloc(3*PARALMOND_NBLOCKS,sizeof(dfloat)); - o_reductionScratch = 
platform.malloc(reductionScratchBytes, dummy); - reductionScratch = platform.hostMalloc(reductionScratchBytes, NULL, h_reductionScratch); - free(dummy); + if (NreductionScratch==0) { + NreductionScratch = 3*PARALMOND_NBLOCKS; + memory dummy(3*PARALMOND_NBLOCKS, 0.0); + reductionScratch = platform.hostMalloc(NreductionScratch, dummy); + o_reductionScratch = platform.malloc(NreductionScratch, dummy); } //extra stroage for kcycle vectors - if (numLevels>0 && numLevelsNcols,sizeof(dfloat)); - o_ck[numLevels] = platform.malloc(level->Ncols*sizeof(dfloat),dummy); - o_vk[numLevels] = platform.malloc(level->Nrows*sizeof(dfloat),dummy); - o_wk[numLevels] = platform.malloc(level->Nrows*sizeof(dfloat),dummy); - free(dummy); + if (k>0 && k dummy(level.Ncols,0.0); + o_ck[k] = platform.malloc(level.Ncols,dummy); + o_vk[k] = platform.malloc(level.Nrows,dummy); + o_wk[k] = platform.malloc(level.Nrows,dummy); } } //allocate space for coarse rhs and x - if (numLevels>0) { - dfloat *dummy = (dfloat *) calloc(level->Ncols,sizeof(dfloat)); - o_x[numLevels] = platform.malloc(level->Ncols*sizeof(dfloat),dummy); - o_rhs[numLevels] = platform.malloc(level->Ncols*sizeof(dfloat),dummy); - free(dummy); + if (k>0) { + memory dummy(level.Ncols,0.0); + o_x[k] = platform.malloc(level.Ncols,dummy); + o_rhs[k] = platform.malloc(level.Ncols,dummy); } //scratch space includes space for residual and 2 vectors used in Chebyshev smoothing - size_t requiredBytes = 2*level->Ncols*sizeof(dfloat); - if (requiredBytes>scratchSpaceBytes) { - scratchSpaceBytes = requiredBytes; - dfloat *dummy = (dfloat *) calloc(2*level->Ncols,sizeof(dfloat)); - o_scratch = platform.malloc(requiredBytes, dummy); - free(dummy); + size_t Nrequired = 2*level.Ncols; + if (Nrequired>NscratchSpace) { + NscratchSpace = Nrequired; + memory dummy(2*level.Ncols,0.0); + o_scratch = platform.malloc(Nrequired, dummy); } - level->o_scratch = o_scratch; - - levels[numLevels++] = level; + level.o_scratch = o_scratch; } } //namespace parAlmond 
+ +} //namespace libp diff --git a/libs/parAlmond/parAlmondSettings.cpp b/libs/parAlmond/parAlmondSettings.cpp index d95349cda..e8dda1d63 100644 --- a/libs/parAlmond/parAlmondSettings.cpp +++ b/libs/parAlmond/parAlmondSettings.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,10 +26,12 @@ SOFTWARE. #include "parAlmond.hpp" +namespace libp { + namespace parAlmond { void AddSettings(settings_t& settings, - const string prefix) { + const std::string prefix) { settings.newSetting(prefix+"PARALMOND CYCLE", "VCYCLE", @@ -67,4 +69,6 @@ void ReportSettings(settings_t& settings) { settings.reportSetting("PARALMOND CHEBYSHEV DEGREE"); } -} //namespace parAlmond \ No newline at end of file +} //namespace parAlmond + +} //namespace libp diff --git a/libs/parAlmond/parAlmondSmoothPrologator.cpp b/libs/parAlmond/parAlmondSmoothPrologator.cpp index 80263560a..ee8563563 100644 --- a/libs/parAlmond/parAlmondSmoothPrologator.cpp +++ b/libs/parAlmond/parAlmondSmoothPrologator.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,14 +27,15 @@ SOFTWARE. 
#include "parAlmond.hpp" #include "parAlmond/parAlmondAMGSetup.hpp" +namespace libp { + namespace parAlmond { -parCSR *smoothProlongator(parCSR *A, parCSR *T){ +parCSR smoothProlongator(parCSR& A, parCSR& T){ // MPI info - int rank, size; - MPI_Comm_rank(A->comm, &rank); - MPI_Comm_size(A->comm, &size); + int rank = A.comm.rank(); + int size = A.comm.size(); // This function computes a smoothed prologation operator // via a single weighted Jacobi iteration on the tentative @@ -44,81 +45,82 @@ parCSR *smoothProlongator(parCSR *A, parCSR *T){ // // To compute D^{-1}*A*T we need all the rows T(j,:) for which // j is a column index for the nonzeros of A on this rank. - // For all local column indices in A->diag, we will already + // For all local column indices in A.diag, we will already // have the row of T on this rank, so we just need to gather // the offd colIds //Jacobi weight - const dfloat omega = (4./3.)/A->rho; + const dfloat omega = (4./3.)/A.rho; - hlong *recvRows = (hlong *) calloc(A->Ncols-A->NlocalCols, sizeof(hlong)); - int *sendCounts = (int*) calloc(size, sizeof(int)); - int *recvCounts = (int*) calloc(size, sizeof(int)); - int *sendOffsets = (int*) calloc(size+1, sizeof(int)); - int *recvOffsets = (int*) calloc(size+1, sizeof(int)); + memory recvRows(A.Ncols-A.NlocalCols); + memory sendCounts(size); + memory recvCounts(size, 0); + memory sendOffsets(size+1); + memory recvOffsets(size+1); //use the colMap of A to list the needed rows of T int r=0; - for (dlong n=A->NlocalCols;nNcols;n++) { - const hlong id = A->colMap[n]; - while (id>=T->globalRowStarts[r+1]) r++; //assumes the halo is sorted + for (dlong n=A.NlocalCols;n=T.globalRowStarts[r+1]) r++; //assumes the halo is sorted recvCounts[r]++; - recvRows[n-A->NlocalCols] = id; //record the row to recv + recvRows[n-A.NlocalCols] = id; //record the row to recv } //share the counts - MPI_Alltoall(recvCounts, 1, MPI_INT, - sendCounts, 1, MPI_INT, A->comm); + A.comm.Alltoall(recvCounts, sendCounts); + 
sendOffsets[0] = 0; + recvOffsets[0] = 0; for (r=0;r sendRows(sendTotal); //share the rowIds - MPI_Alltoallv(recvRows, recvCounts, recvOffsets, MPI_HLONG, - sendRows, sendCounts, sendOffsets, MPI_HLONG, - T->comm); + T.comm.Alltoallv(recvRows, recvCounts, recvOffsets, + sendRows, sendCounts, sendOffsets); //we now have a list of rows to send, count the nnz to send dlong nnzTotal=0; for (r=0;rglobalRowStarts[rank]); //local row id - sendCounts[r]+= T->diag.rowStarts[i+1]-T->diag.rowStarts[i]; //count entries in this row - sendCounts[r]+= T->offd.rowStarts[i+1]-T->offd.rowStarts[i]; //count entries in this row + dlong i = (dlong) (sendRows[n]-T.globalRowStarts[rank]); //local row id + sendCounts[r]+= T.diag.rowStarts[i+1]-T.diag.rowStarts[i]; //count entries in this row + sendCounts[r]+= T.offd.rowStarts[i+1]-T.offd.rowStarts[i]; //count entries in this row } nnzTotal += sendCounts[r]; //tally the total } - parCOO::nonZero_t *sendNonZeros = (parCOO::nonZero_t *) calloc(nnzTotal, sizeof(parCOO::nonZero_t)); + memory sendNonZeros(nnzTotal); nnzTotal=0; //reset for (r=0;rglobalRowStarts[rank]); //local row id - for (dlong jj=T->diag.rowStarts[i]; jjdiag.rowStarts[i+1];jj++){ + dlong i = (dlong) (sendRows[n] - T.globalRowStarts[rank]); //local row id + for (dlong jj=T.diag.rowStarts[i]; jjdiag.cols[jj] + T->globalColStarts[rank]; - sendNonZeros[nnzTotal].val = T->diag.vals[jj]; + sendNonZeros[nnzTotal].col = T.diag.cols[jj] + T.globalColStarts[rank]; + sendNonZeros[nnzTotal].val = T.diag.vals[jj]; nnzTotal++; } - for (dlong jj=T->offd.rowStarts[i]; jjoffd.rowStarts[i+1];jj++){ + for (dlong jj=T.offd.rowStarts[i]; jjcolMap[T->offd.cols[jj]]; - sendNonZeros[nnzTotal].val = T->offd.vals[jj]; + sendNonZeros[nnzTotal].col = T.colMap[T.offd.cols[jj]]; + sendNonZeros[nnzTotal].val = T.offd.vals[jj]; nnzTotal++; } } } - MPI_Alltoall(sendCounts, 1, MPI_INT, - recvCounts, 1, MPI_INT, A->comm); + A.comm.Alltoall(sendCounts, recvCounts); + sendOffsets[0] = 0; + recvOffsets[0] = 0; 
for (r=0;rcomm); + memory ToffdRows(Toffdnnz); - //clean up - MPI_Barrier(T->comm); - free(sendNonZeros); - free(sendCounts); - free(recvCounts); - free(sendOffsets); - free(recvOffsets); + T.comm.Alltoallv(sendNonZeros, sendCounts, sendOffsets, + ToffdRows, recvCounts, recvOffsets); //we now have all the needed nonlocal rows (should also be sorted by row then col) //make an array of row offsets so we know how large each row is - dlong *ToffdRowOffsets = (dlong *) calloc(A->Ncols-A->NlocalCols+1, sizeof(dlong)); + memory ToffdRowOffsets(A.Ncols-A.NlocalCols+1, 0); dlong id=0; for (dlong n=0;ncolMap[id+A->NlocalCols]!=row) id++; + while(A.colMap[id+A.NlocalCols]!=row) id++; ToffdRowOffsets[id+1]++; //count entry in row } //cumulative sum - for (dlong n=0;nNcols-A->NlocalCols;n++) + for (dlong n=0;ndiag.nnz+T->offd.nnz; //start with T populated + nnzTotal = T.diag.nnz+T.offd.nnz; //start with T populated - for (dlong i=0;iNrows;i++) { + for (dlong i=0;idiag.rowStarts[i]; - dlong end = A->diag.rowStarts[i+1]; + dlong start = A.diag.rowStarts[i]; + dlong end = A.diag.rowStarts[i+1]; for (dlong j=start;jdiag.cols[j]; - const int nnzBj = T->diag.rowStarts[col+1]-T->diag.rowStarts[col] - +T->offd.rowStarts[col+1]-T->offd.rowStarts[col]; + const dlong col = A.diag.cols[j]; + const int nnzBj = T.diag.rowStarts[col+1]-T.diag.rowStarts[col] + +T.offd.rowStarts[col+1]-T.offd.rowStarts[col]; nnzTotal += nnzBj; } //non-local entries - start = A->offd.rowStarts[i]; - end = A->offd.rowStarts[i+1]; + start = A.offd.rowStarts[i]; + end = A.offd.rowStarts[i+1]; for (dlong j=start;joffd.cols[j]-A->NlocalCols; + const dlong col = A.offd.cols[j]-A.NlocalCols; const int nnzBj = ToffdRowOffsets[col+1] - ToffdRowOffsets[col]; nnzTotal += nnzBj; } } - parCOO::nonZero_t *Ptmp = (parCOO::nonZero_t *) - calloc(nnzTotal, sizeof(parCOO::nonZero_t)); + memory Ptmp(nnzTotal); // Fill the intermediate form of P dlong cnt = 0; //First P = T - for (dlong i=0;iNrows;i++) { + for (dlong 
i=0;idiag.rowStarts[i]; - dlong end = T->diag.rowStarts[i+1]; + dlong start = T.diag.rowStarts[i]; + dlong end = T.diag.rowStarts[i+1]; for (dlong j=start;jglobalRowStarts[rank]; - Ptmp[cnt].col = T->diag.cols[j]+T->globalColStarts[rank]; //global id - Ptmp[cnt].val = T->diag.vals[j]; + Ptmp[cnt].row = i + T.globalRowStarts[rank]; + Ptmp[cnt].col = T.diag.cols[j]+T.globalColStarts[rank]; //global id + Ptmp[cnt].val = T.diag.vals[j]; cnt++; } //non-local T entries - start = T->offd.rowStarts[i]; - end = T->offd.rowStarts[i+1]; + start = T.offd.rowStarts[i]; + end = T.offd.rowStarts[i+1]; for (dlong j=start;jglobalRowStarts[rank]; - Ptmp[cnt].col = T->colMap[T->offd.cols[j]]; - Ptmp[cnt].val = T->offd.vals[j]; + Ptmp[cnt].row = i + T.globalRowStarts[rank]; + Ptmp[cnt].col = T.colMap[T.offd.cols[j]]; + Ptmp[cnt].val = T.offd.vals[j]; cnt++; } } //Then P -= omega*invD*A*T - for (dlong i=0;iNrows;i++) { + for (dlong i=0;idiag.rowStarts[i]; - dlong end = A->diag.rowStarts[i+1]; + dlong start = A.diag.rowStarts[i]; + dlong end = A.diag.rowStarts[i+1]; - const dfloat invDi = 1.0/A->diagA[i]; + const dfloat invDi = 1.0/A.diagA[i]; for (dlong j=start;jdiag.cols[j]; - const dfloat Aval = -omega*invDi*A->diag.vals[j]; + const dlong col = A.diag.cols[j]; + const dfloat Aval = -omega*invDi*A.diag.vals[j]; //local T entries - dlong Tstart = T->diag.rowStarts[col]; - dlong Tend = T->diag.rowStarts[col+1]; + dlong Tstart = T.diag.rowStarts[col]; + dlong Tend = T.diag.rowStarts[col+1]; for (dlong jj=Tstart;jjglobalRowStarts[rank]; - Ptmp[cnt].col = T->diag.cols[jj]+T->globalColStarts[rank]; //global id - Ptmp[cnt].val = Aval*T->diag.vals[jj]; + Ptmp[cnt].row = i + A.globalRowStarts[rank]; + Ptmp[cnt].col = T.diag.cols[jj]+T.globalColStarts[rank]; //global id + Ptmp[cnt].val = Aval*T.diag.vals[jj]; cnt++; } //non-local T entries - Tstart = T->offd.rowStarts[col]; - Tend = T->offd.rowStarts[col+1]; + Tstart = T.offd.rowStarts[col]; + Tend = T.offd.rowStarts[col+1]; for (dlong 
jj=Tstart;jjglobalRowStarts[rank]; - Ptmp[cnt].col = T->colMap[T->offd.cols[jj]]; //global id - Ptmp[cnt].val = Aval*T->offd.vals[jj]; + Ptmp[cnt].row = i + A.globalRowStarts[rank]; + Ptmp[cnt].col = T.colMap[T.offd.cols[jj]]; //global id + Ptmp[cnt].val = Aval*T.offd.vals[jj]; cnt++; } } //non-local A entries - start = A->offd.rowStarts[i]; - end = A->offd.rowStarts[i+1]; + start = A.offd.rowStarts[i]; + end = A.offd.rowStarts[i+1]; for (dlong j=start;joffd.cols[j]-A->NlocalCols; - const dfloat Aval = -omega*invDi*A->offd.vals[j]; + const dlong col = A.offd.cols[j]-A.NlocalCols; + const dfloat Aval = -omega*invDi*A.offd.vals[j]; // entries from recived rows of T dlong Tstart = ToffdRowOffsets[col]; dlong Tend = ToffdRowOffsets[col+1]; for (dlong jj=Tstart;jjglobalRowStarts[rank]; + Ptmp[cnt].row = i + A.globalRowStarts[rank]; Ptmp[cnt].col = ToffdRows[jj].col; //global id Ptmp[cnt].val = Aval*ToffdRows[jj].val; cnt++; } } } - free(ToffdRowOffsets); - free(ToffdRows); - //sort entries by the row and col - std::sort(Ptmp, Ptmp+nnzTotal, + std::sort(Ptmp.ptr(), Ptmp.ptr()+nnzTotal, [](const parCOO::nonZero_t& a, const parCOO::nonZero_t& b) { if (a.row < b.row) return true; if (a.row > b.row) return false; @@ -283,16 +271,14 @@ parCSR *smoothProlongator(parCSR *A, parCSR *T){ if ((Ptmp[i].row!=Ptmp[i-1].row)|| (Ptmp[i].col!=Ptmp[i-1].col)) nnz++; - parCOO cooP(A->platform, A->comm); + parCOO cooP(A.platform, A.comm); //copy global partition - cooP.globalRowStarts = (hlong *) calloc(size+1,sizeof(hlong)); - cooP.globalColStarts = (hlong *) calloc(size+1,sizeof(hlong)); - memcpy(cooP.globalRowStarts, A->globalRowStarts, (size+1)*sizeof(hlong)); - memcpy(cooP.globalColStarts, T->globalColStarts, (size+1)*sizeof(hlong)); + cooP.globalRowStarts = A.globalRowStarts; + cooP.globalColStarts = T.globalColStarts; cooP.nnz = nnz; - cooP.entries = (parCOO::nonZero_t *) calloc(nnz,sizeof(parCOO::nonZero_t)); + cooP.entries.malloc(nnz); //compress nonzeros nnz = 0; @@ -305,11 
+291,11 @@ parCSR *smoothProlongator(parCSR *A, parCSR *T){ cooP.entries[nnz-1].val += Ptmp[i].val; } } - //clean up - free(Ptmp); //build P from coo matrix - return new parCSR(cooP); + return parCSR(cooP); } -} //namespace parAlmond \ No newline at end of file +} //namespace parAlmond + +} //namespace libp diff --git a/libs/parAlmond/parAlmondSpMM.cpp b/libs/parAlmond/parAlmondSpMM.cpp index 36d013232..7b67ee1af 100644 --- a/libs/parAlmond/parAlmondSpMM.cpp +++ b/libs/parAlmond/parAlmondSpMM.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,88 +27,88 @@ SOFTWARE. #include "parAlmond.hpp" #include "parAlmond/parAlmondAMGSetup.hpp" +namespace libp { + namespace parAlmond { -parCSR *SpMM(parCSR *A, parCSR *B){ +parCSR SpMM(parCSR& A, parCSR& B){ // MPI info - int rank, size; - MPI_Comm_rank(A->comm, &rank); - MPI_Comm_size(A->comm, &size); + int rank = A.comm.rank(); + int size = A.comm.size(); // To compute C = A*B we need all the rows B(j,:) for which // j is a column index for the nonzeros of A on this rank. 
- // For all local column indices in A->diag, we will already + // For all local column indices in A.diag, we will already // have the row of B on this rank, so we just need to gather // the offd colIds - hlong *recvRows = (hlong *) calloc(A->Ncols-A->NlocalCols, sizeof(hlong)); - int *sendCounts = (int*) calloc(size, sizeof(int)); - int *recvCounts = (int*) calloc(size, sizeof(int)); - int *sendOffsets = (int*) calloc(size+1, sizeof(int)); - int *recvOffsets = (int*) calloc(size+1, sizeof(int)); + memory recvRows(A.Ncols-A.NlocalCols); + memory sendCounts(size); + memory recvCounts(size, 0); + memory sendOffsets(size+1); + memory recvOffsets(size+1); //use the colMap of A to list the needed rows of B int r=0; - for (dlong n=A->NlocalCols;nNcols;n++) { - const hlong id = A->colMap[n]; - while (id>=B->globalRowStarts[r+1]) r++; //assumes the halo is sorted + for (dlong n=A.NlocalCols;n=B.globalRowStarts[r+1]) r++; //assumes the halo is sorted recvCounts[r]++; - recvRows[n-A->NlocalCols] = id; //record the row to recv + recvRows[n-A.NlocalCols] = id; //record the row to recv } //share the counts - MPI_Alltoall(recvCounts, 1, MPI_INT, - sendCounts, 1, MPI_INT, A->comm); + A.comm.Alltoall(recvCounts, sendCounts); + sendOffsets[0] = 0; + recvOffsets[0] = 0; for (r=0;r sendRows(sendTotal); //share the rowIds - MPI_Alltoallv(recvRows, recvCounts, recvOffsets, MPI_HLONG, - sendRows, sendCounts, sendOffsets, MPI_HLONG, - B->comm); + B.comm.Alltoallv(recvRows, recvCounts, recvOffsets, + sendRows, sendCounts, sendOffsets); //we now have a list of rows to send, count the nnz to send dlong nnzTotal=0; for (r=0;rglobalRowStarts[rank]); //local row id - sendCounts[r]+= B->diag.rowStarts[i+1]-B->diag.rowStarts[i]; //count entries in this row - sendCounts[r]+= B->offd.rowStarts[i+1]-B->offd.rowStarts[i]; //count entries in this row + dlong i = static_cast(sendRows[n]-B.globalRowStarts[rank]); //local row id + sendCounts[r]+= B.diag.rowStarts[i+1]-B.diag.rowStarts[i]; //count 
entries in this row + sendCounts[r]+= B.offd.rowStarts[i+1]-B.offd.rowStarts[i]; //count entries in this row } nnzTotal += sendCounts[r]; //tally the total } - parCOO::nonZero_t *sendNonZeros = (parCOO::nonZero_t *) calloc(nnzTotal, sizeof(parCOO::nonZero_t)); + memory sendNonZeros(nnzTotal); nnzTotal=0; //reset for (r=0;rglobalRowStarts[rank]); //local row id - for (dlong jj=B->diag.rowStarts[i]; jjdiag.rowStarts[i+1];jj++){ + dlong i = static_cast(sendRows[n] - B.globalRowStarts[rank]); //local row id + for (dlong jj=B.diag.rowStarts[i]; jjdiag.cols[jj] + B->globalColStarts[rank]; - sendNonZeros[nnzTotal].val = B->diag.vals[jj]; + sendNonZeros[nnzTotal].col = B.diag.cols[jj] + B.globalColStarts[rank]; + sendNonZeros[nnzTotal].val = B.diag.vals[jj]; nnzTotal++; } - for (dlong jj=B->offd.rowStarts[i]; jjoffd.rowStarts[i+1];jj++){ + for (dlong jj=B.offd.rowStarts[i]; jjcolMap[B->offd.cols[jj]]; - sendNonZeros[nnzTotal].val = B->offd.vals[jj]; + sendNonZeros[nnzTotal].col = B.colMap[B.offd.cols[jj]]; + sendNonZeros[nnzTotal].val = B.offd.vals[jj]; nnzTotal++; } } } - MPI_Alltoall(sendCounts, 1, MPI_INT, - recvCounts, 1, MPI_INT, A->comm); + A.comm.Alltoall(sendCounts, recvCounts); for (r=0;rcomm); + memory BoffdRows(Boffdnnz); - //clean up - MPI_Barrier(B->comm); - free(sendNonZeros); - free(sendCounts); - free(recvCounts); - free(sendOffsets); - free(recvOffsets); + B.comm.Alltoallv(sendNonZeros, sendCounts, sendOffsets, + BoffdRows, recvCounts, recvOffsets); //we now have all the needed nonlocal rows (should also be sorted by row then col) //make an array of row offsets so we know how large each row is - dlong *BoffdRowOffsets = (dlong *) calloc(A->Ncols-A->NlocalCols+1, sizeof(dlong)); + memory BoffdRowOffsets(A.Ncols-A.NlocalCols+1, 0); dlong id=0; for (dlong n=0;ncolMap[id+A->NlocalCols]!=row) id++; + while(A.colMap[id+A.NlocalCols]!=row) id++; BoffdRowOffsets[id+1]++; //count entry in row } //cumulative sum - for (dlong n=0;nNcols-A->NlocalCols;n++) + for 
(dlong n=0;nNrows;i++) { + for (dlong i=0;idiag.rowStarts[i]; - dlong end = A->diag.rowStarts[i+1]; + dlong start = A.diag.rowStarts[i]; + dlong end = A.diag.rowStarts[i+1]; for (dlong j=start;jdiag.cols[j]; - const int nnzBj = B->diag.rowStarts[col+1]-B->diag.rowStarts[col] - +B->offd.rowStarts[col+1]-B->offd.rowStarts[col]; + const dlong col = A.diag.cols[j]; + const int nnzBj = B.diag.rowStarts[col+1]-B.diag.rowStarts[col] + +B.offd.rowStarts[col+1]-B.offd.rowStarts[col]; nnzTotal += nnzBj; } //non-local entries - start = A->offd.rowStarts[i]; - end = A->offd.rowStarts[i+1]; + start = A.offd.rowStarts[i]; + end = A.offd.rowStarts[i+1]; for (dlong j=start;joffd.cols[j]-A->NlocalCols; + const dlong col = A.offd.cols[j]-A.NlocalCols; const int nnzBj = BoffdRowOffsets[col+1] - BoffdRowOffsets[col]; nnzTotal += nnzBj; } } - parCOO::nonZero_t *Ctmp = (parCOO::nonZero_t *) - calloc(nnzTotal, sizeof(parCOO::nonZero_t)); + memory Ctmp(nnzTotal); // Fill the intermediate form of C dlong cnt = 0; - for (dlong i=0;iNrows;i++) { + for (dlong i=0;idiag.rowStarts[i]; - dlong end = A->diag.rowStarts[i+1]; + dlong start = A.diag.rowStarts[i]; + dlong end = A.diag.rowStarts[i+1]; for (dlong j=start;jdiag.cols[j]; - const dfloat Aval = A->diag.vals[j]; + const dlong col = A.diag.cols[j]; + const dfloat Aval = A.diag.vals[j]; //local B entries - dlong Bstart = B->diag.rowStarts[col]; - dlong Bend = B->diag.rowStarts[col+1]; + dlong Bstart = B.diag.rowStarts[col]; + dlong Bend = B.diag.rowStarts[col+1]; for (dlong jj=Bstart;jjglobalRowStarts[rank]; - Ctmp[cnt].col = B->diag.cols[jj]+B->globalColStarts[rank]; //global id - Ctmp[cnt].val = Aval*B->diag.vals[jj]; + Ctmp[cnt].row = i + A.globalRowStarts[rank]; + Ctmp[cnt].col = B.diag.cols[jj]+B.globalColStarts[rank]; //global id + Ctmp[cnt].val = Aval*B.diag.vals[jj]; cnt++; } //non-local B entries - Bstart = B->offd.rowStarts[col]; - Bend = B->offd.rowStarts[col+1]; + Bstart = B.offd.rowStarts[col]; + Bend = B.offd.rowStarts[col+1]; 
for (dlong jj=Bstart;jjglobalRowStarts[rank]; - Ctmp[cnt].col = B->colMap[B->offd.cols[jj]]; //global id - Ctmp[cnt].val = Aval*B->offd.vals[jj]; + Ctmp[cnt].row = i + A.globalRowStarts[rank]; + Ctmp[cnt].col = B.colMap[B.offd.cols[jj]]; //global id + Ctmp[cnt].val = Aval*B.offd.vals[jj]; cnt++; } } //non-local A entries - start = A->offd.rowStarts[i]; - end = A->offd.rowStarts[i+1]; + start = A.offd.rowStarts[i]; + end = A.offd.rowStarts[i+1]; for (dlong j=start;joffd.cols[j]-A->NlocalCols; - const dfloat Aval = A->offd.vals[j]; + const dlong col = A.offd.cols[j]-A.NlocalCols; + const dfloat Aval = A.offd.vals[j]; // entries from recived rows of B dlong Bstart = BoffdRowOffsets[col]; dlong Bend = BoffdRowOffsets[col+1]; for (dlong jj=Bstart;jjglobalRowStarts[rank]; + Ctmp[cnt].row = i + A.globalRowStarts[rank]; Ctmp[cnt].col = BoffdRows[jj].col; //global id Ctmp[cnt].val = Aval*BoffdRows[jj].val; cnt++; } } } - free(BoffdRowOffsets); - free(BoffdRows); - //sort entries by the row and col - std::sort(Ctmp, Ctmp+nnzTotal, + std::sort(Ctmp.ptr(), Ctmp.ptr()+nnzTotal, [](const parCOO::nonZero_t& a, const parCOO::nonZero_t& b) { if (a.row < b.row) return true; if (a.row > b.row) return false; @@ -247,16 +233,14 @@ parCSR *SpMM(parCSR *A, parCSR *B){ if ((Ctmp[i].row!=Ctmp[i-1].row)|| (Ctmp[i].col!=Ctmp[i-1].col)) nnz++; - parCOO cooC(A->platform, A->comm); + parCOO cooC(A.platform, A.comm); //copy global partition - cooC.globalRowStarts = (hlong *) calloc(size+1,sizeof(hlong)); - cooC.globalColStarts = (hlong *) calloc(size+1,sizeof(hlong)); - memcpy(cooC.globalRowStarts, A->globalRowStarts, (size+1)*sizeof(hlong)); - memcpy(cooC.globalColStarts, B->globalColStarts, (size+1)*sizeof(hlong)); + cooC.globalRowStarts = A.globalRowStarts; + cooC.globalColStarts = B.globalColStarts; cooC.nnz = nnz; - cooC.entries = (parCOO::nonZero_t *) calloc(nnz,sizeof(parCOO::nonZero_t)); + cooC.entries.malloc(nnz); //compress nonzeros nnz = 0; @@ -269,11 +253,11 @@ parCSR *SpMM(parCSR 
*A, parCSR *B){ cooC.entries[nnz-1].val += Ctmp[i].val; } } - //clean up - free(Ctmp); //build C from coo matrix - return new parCSR(cooC); + return parCSR(cooC); } -} //namespace parAlmond \ No newline at end of file +} //namespace parAlmond + +} //namespace libp diff --git a/libs/parAlmond/parAlmondStrongGraph.cpp b/libs/parAlmond/parAlmondStrongGraph.cpp index 6453ad2c3..5fa6a6e1e 100644 --- a/libs/parAlmond/parAlmondStrongGraph.cpp +++ b/libs/parAlmond/parAlmondStrongGraph.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,12 +27,14 @@ SOFTWARE. #include "parAlmond.hpp" #include "parAlmond/parAlmondAMGSetup.hpp" +namespace libp { + namespace parAlmond { -static strongGraph_t* RugeStubenStrength(parCSR *A, dfloat theta); -static strongGraph_t* SymmetricStrength(parCSR *A, dfloat theta); +static strongGraph_t RugeStubenStrength(parCSR& A, dfloat theta); +static strongGraph_t SymmetricStrength(parCSR& A, dfloat theta); -strongGraph_t* strongGraph(parCSR *A, StrengthType type, dfloat theta){ +strongGraph_t strongGraph(parCSR& A, StrengthType type, dfloat theta){ if (type==RUGESTUBEN) { return RugeStubenStrength(A, theta); @@ -42,19 +44,18 @@ strongGraph_t* strongGraph(parCSR *A, StrengthType type, dfloat theta){ } -static strongGraph_t* RugeStubenStrength(parCSR *A, dfloat theta) { +static strongGraph_t RugeStubenStrength(parCSR& A, dfloat theta) { - const dlong N = A->Nrows; - const dlong M = A->Ncols; + const dlong N = A.Nrows; + const dlong M = A.Ncols; - strongGraph_t *C = new strongGraph_t(N, M, A->platform, A->comm); + strongGraph_t C(N, M, A.platform, A.comm); - C->rowStarts = (dlong *) calloc(N+1,sizeof(dlong)); + 
C.rowStarts.malloc(N+1); - dfloat *maxOD = nullptr; - maxOD = (dfloat *) calloc(N,sizeof(dfloat)); + memory maxOD(N,0.0); - dfloat *diagA = A->diagA; + memory diagA = A.diagA; //find maxOD // #pragma omp parallel for @@ -62,49 +63,50 @@ static strongGraph_t* RugeStubenStrength(parCSR *A, dfloat theta) { const int sign = (diagA[i] >= 0) ? 1:-1; //local entries - dlong Jstart = A->diag.rowStarts[i]; - dlong Jend = A->diag.rowStarts[i+1]; + dlong Jstart = A.diag.rowStarts[i]; + dlong Jend = A.diag.rowStarts[i+1]; for(dlong jj= Jstart; jjdiag.cols[jj]; + const dlong col = A.diag.cols[jj]; if (col==i) continue; - const dfloat OD = -sign*A->diag.vals[jj]; + const dfloat OD = -sign*A.diag.vals[jj]; if(OD > maxOD[i]) maxOD[i] = OD; } //non-local entries - Jstart = A->offd.rowStarts[i]; - Jend = A->offd.rowStarts[i+1]; + Jstart = A.offd.rowStarts[i]; + Jend = A.offd.rowStarts[i+1]; for(dlong jj= Jstart; jjoffd.vals[jj]; + dfloat OD = -sign*A.offd.vals[jj]; if(OD > maxOD[i]) maxOD[i] = OD; } int strong_per_row = 1; // diagonal entry //local entries - Jstart = A->diag.rowStarts[i]; - Jend = A->diag.rowStarts[i+1]; + Jstart = A.diag.rowStarts[i]; + Jend = A.diag.rowStarts[i+1]; for(dlong jj = Jstart; jjdiag.cols[jj]; + const dlong col = A.diag.cols[jj]; if (col==i) continue; - const dfloat OD = -sign*A->diag.vals[jj]; + const dfloat OD = -sign*A.diag.vals[jj]; if(OD > theta*maxOD[i]) strong_per_row++; } //non-local entries - Jstart = A->offd.rowStarts[i]; - Jend = A->offd.rowStarts[i+1]; + Jstart = A.offd.rowStarts[i]; + Jend = A.offd.rowStarts[i+1]; for(dlong jj= Jstart; jjoffd.vals[jj]; + const dfloat OD = -sign*A.offd.vals[jj]; if(OD > theta*maxOD[i]) strong_per_row++; } - C->rowStarts[i+1] = strong_per_row; + C.rowStarts[i+1] = strong_per_row; } // cumulative sum + C.rowStarts[0] = 0.0; for(dlong i=1; irowStarts[i] += C->rowStarts[i-1]; + C.rowStarts[i] += C.rowStarts[i-1]; } - C->nnz = C->rowStarts[N]; - C->cols = (dlong *) malloc(C->nnz*sizeof(dlong)); + C.nnz = 
C.rowStarts[N]; + C.cols.malloc(C.nnz); // fill in the columns for strong connections @@ -112,124 +114,126 @@ static strongGraph_t* RugeStubenStrength(parCSR *A, dfloat theta) { for(dlong i=0; i= 0) ? 1:-1; - dlong counter = C->rowStarts[i]; + dlong counter = C.rowStarts[i]; //local entries - dlong Jstart = A->diag.rowStarts[i]; - dlong Jend = A->diag.rowStarts[i+1]; + dlong Jstart = A.diag.rowStarts[i]; + dlong Jend = A.diag.rowStarts[i+1]; for(dlong jj = Jstart; jjdiag.cols[jj]; + const dlong col = A.diag.cols[jj]; if (col==i) { - C->cols[counter++] = col;// diag entry + C.cols[counter++] = col;// diag entry continue; } - const dfloat OD = -sign*A->diag.vals[jj]; + const dfloat OD = -sign*A.diag.vals[jj]; if(OD > theta*maxOD[i]) - C->cols[counter++] = col; + C.cols[counter++] = col; } //nonlocal entries - Jstart = A->offd.rowStarts[i]; - Jend = A->offd.rowStarts[i+1]; + Jstart = A.offd.rowStarts[i]; + Jend = A.offd.rowStarts[i+1]; for(dlong jj = Jstart; jjoffd.cols[jj]; - const dfloat OD = -sign*A->offd.vals[jj]; + const dlong col = A.offd.cols[jj]; + const dfloat OD = -sign*A.offd.vals[jj]; if(OD > theta*maxOD[i]) - C->cols[counter++] = col; + C.cols[counter++] = col; } } - free(maxOD); return C; } -static strongGraph_t* SymmetricStrength(parCSR *A, dfloat theta) { +static strongGraph_t SymmetricStrength(parCSR& A, dfloat theta) { - const dlong N = A->Nrows; - const dlong M = A->Ncols; + const dlong N = A.Nrows; + const dlong M = A.Ncols; - strongGraph_t *C = new strongGraph_t(N, M, A->platform, A->comm); + strongGraph_t C(N, M, A.platform, A.comm); - C->rowStarts = (dlong *) calloc(N+1,sizeof(dlong)); + C.rowStarts.malloc(N+1); - dfloat *diagA = A->diagA; + memory diagA = A.diagA; // #pragma omp parallel for for(dlong i=0; idiag.rowStarts[i]; - dlong Jend = A->diag.rowStarts[i+1]; + dlong Jstart = A.diag.rowStarts[i]; + dlong Jend = A.diag.rowStarts[i+1]; for(dlong jj= Jstart; jjdiag.cols[jj]; + const dlong col = A.diag.cols[jj]; if (col==i) continue; - const 
dfloat Ajj = fabs(diagA[col]); + const dfloat Ajj = std::abs(diagA[col]); - if(fabs(A->diag.vals[jj]) > theta*(sqrt(Aii*Ajj))) + if(std::abs(A.diag.vals[jj]) > theta*(sqrt(Aii*Ajj))) strong_per_row++; } //non-local entries - Jstart = A->offd.rowStarts[i]; - Jend = A->offd.rowStarts[i+1]; + Jstart = A.offd.rowStarts[i]; + Jend = A.offd.rowStarts[i+1]; for(dlong jj= Jstart; jjoffd.cols[jj]; - const dfloat Ajj = fabs(diagA[col]); + const dlong col = A.offd.cols[jj]; + const dfloat Ajj = std::abs(diagA[col]); - if(fabs(A->offd.vals[jj]) > theta*(sqrt(Aii*Ajj))) + if(std::abs(A.offd.vals[jj]) > theta*(sqrt(Aii*Ajj))) strong_per_row++; } - C->rowStarts[i+1] = strong_per_row; + C.rowStarts[i+1] = strong_per_row; } // cumulative sum + C.rowStarts[0] = 0; for(dlong i=1; irowStarts[i] += C->rowStarts[i-1]; + C.rowStarts[i] += C.rowStarts[i-1]; } - C->nnz = C->rowStarts[N]; - C->cols = (dlong *) malloc(C->nnz*sizeof(dlong)); + C.nnz = C.rowStarts[N]; + C.cols.malloc(C.nnz); // fill in the columns for strong connections // #pragma omp parallel for for(dlong i=0; irowStarts[i]; + dlong counter = C.rowStarts[i]; //local entries - dlong Jstart = A->diag.rowStarts[i]; - dlong Jend = A->diag.rowStarts[i+1]; + dlong Jstart = A.diag.rowStarts[i]; + dlong Jend = A.diag.rowStarts[i+1]; for(dlong jj= Jstart; jjdiag.cols[jj]; + const dlong col = A.diag.cols[jj]; if (col==i) { - C->cols[counter++] = col;// diag entry + C.cols[counter++] = col;// diag entry continue; } - const dfloat Ajj = fabs(diagA[col]); + const dfloat Ajj = std::abs(diagA[col]); - if(fabs(A->diag.vals[jj]) > theta*(sqrt(Aii*Ajj))) - C->cols[counter++] = col; + if(std::abs(A.diag.vals[jj]) > theta*(sqrt(Aii*Ajj))) + C.cols[counter++] = col; } //non-local entries - Jstart = A->offd.rowStarts[i]; - Jend = A->offd.rowStarts[i+1]; + Jstart = A.offd.rowStarts[i]; + Jend = A.offd.rowStarts[i+1]; for(dlong jj= Jstart; jjoffd.cols[jj]; + const dlong col = A.offd.cols[jj]; - const dfloat Ajj = fabs(diagA[col]); + const dfloat 
Ajj = std::abs(diagA[col]); - if(fabs(A->offd.vals[jj]) > theta*(sqrt(Aii*Ajj))) - C->cols[counter++] = col; + if(std::abs(A.offd.vals[jj]) > theta*(sqrt(Aii*Ajj))) + C.cols[counter++] = col; } } return C; } -} //namespace parAlmond \ No newline at end of file +} //namespace parAlmond + +} //namespace libp diff --git a/libs/parAlmond/parAlmondTentativeProlongator.cpp b/libs/parAlmond/parAlmondTentativeProlongator.cpp index 8d93caba9..8632a3399 100644 --- a/libs/parAlmond/parAlmondTentativeProlongator.cpp +++ b/libs/parAlmond/parAlmondTentativeProlongator.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,29 +27,28 @@ SOFTWARE. #include "parAlmond.hpp" #include "parAlmond/parAlmondAMGSetup.hpp" +namespace libp { + namespace parAlmond { -parCSR *tentativeProlongator(parCSR *A, hlong *FineToCoarse, - hlong *globalAggStarts, dfloat *null){ +parCSR tentativeProlongator(parCSR& A, memory FineToCoarse, + memory globalAggStarts, memory null){ - int rank, size; - MPI_Comm_rank(A->comm, &rank); - MPI_Comm_size(A->comm, &size); + int rank = A.comm.rank(); + // int size = A.comm.size(); - const dlong NCoarse = (dlong) (globalAggStarts[rank+1]-globalAggStarts[rank]); //local num agg + const dlong NCoarse = static_cast(globalAggStarts[rank+1]-globalAggStarts[rank]); //local num agg - parCOO cooP(A->platform, A->comm); + parCOO cooP(A.platform, A.comm); //copy global partition - cooP.globalRowStarts = (hlong *) calloc(size+1,sizeof(hlong)); - cooP.globalColStarts = (hlong *) calloc(size+1,sizeof(hlong)); - memcpy(cooP.globalRowStarts, A->globalRowStarts, (size+1)*sizeof(hlong)); - memcpy(cooP.globalColStarts, globalAggStarts, 
(size+1)*sizeof(hlong)); + cooP.globalRowStarts = A.globalRowStarts; + cooP.globalColStarts = globalAggStarts; - const hlong globalRowOffset = A->globalRowStarts[rank]; + const hlong globalRowOffset = A.globalRowStarts[rank]; - cooP.nnz = A->Nrows; - cooP.entries = (parCOO::nonZero_t *) malloc(cooP.nnz*sizeof(parCOO::nonZero_t)); + cooP.nnz = A.Nrows; + cooP.entries.malloc(cooP.nnz); for(dlong n=0; nNcols > A->Ncols) - LIBP_ABORT(string("Size of Coarse nullvector is too large, need to re-alloc")) + LIBP_ABORT("Size of Coarse nullvector is too large, need to re-alloc", + P.Ncols > A.Ncols); //set coarse null to 0 - for(dlong i=0; iNcols; i++) null[i] = 0.0; + for(dlong i=0; idiag.nnz; i++) - null[P->diag.cols[i]] += P->diag.vals[i] * P->diag.vals[i]; + for(dlong i=0; ioffd.nnz; i++) - null[P->offd.cols[i]] += P->offd.vals[i] * P->offd.vals[i]; + for(dlong i=0; ihalo->Combine(null, 1, ogs_dfloat); + P.halo.Combine(null, 1); for(dlong i=0; ihalo->Exchange(null, 1, ogs_dfloat); + P.halo.Exchange(null, 1); - for(dlong i=0; idiag.nnz; i++) - P->diag.vals[i] /= null[P->diag.cols[i]]; - for(dlong i=0; ioffd.nnz; i++) - P->offd.vals[i] /= null[P->offd.cols[i]]; + for(dlong i=0; icomm, &rank); - MPI_Comm_size(A->comm, &size); + int rank = A.comm.rank(); + int size = A.comm.size(); // copy data from nonlocal entries into send buffer - parCOO::nonZero_t *sendNonZeros = (parCOO::nonZero_t *) - calloc(A->offd.nnz, sizeof(parCOO::nonZero_t)); - for(dlong i=0;ioffd.nzRows;++i){ - const hlong row = A->offd.rows[i] + A->globalRowStarts[rank]; //global ids - for (dlong j=A->offd.mRowStarts[i];joffd.mRowStarts[i+1];j++) { - const hlong col = A->colMap[A->offd.cols[j]]; //global ids + memory sendNonZeros(A.offd.nnz); + for(dlong i=0;ioffd.vals[j]; + sendNonZeros[j].val = A.offd.vals[j]; } } //sort by destination row - std::sort(sendNonZeros, sendNonZeros+A->offd.nnz, + std::sort(sendNonZeros.ptr(), sendNonZeros.ptr()+A.offd.nnz, [](const parCOO::nonZero_t& a, const parCOO::nonZero_t& 
b) { if (a.row < b.row) return true; if (a.row > b.row) return false; @@ -59,21 +59,22 @@ parCSR *transpose(parCSR *A){ }); //count number of non-zeros we're sending - int *sendCounts = (int*) calloc(size, sizeof(int)); - int *recvCounts = (int*) calloc(size, sizeof(int)); - int *sendOffsets = (int*) calloc(size+1, sizeof(int)); - int *recvOffsets = (int*) calloc(size+1, sizeof(int)); + memory sendCounts(size, 0); + memory recvCounts(size); + memory sendOffsets(size+1); + memory recvOffsets(size+1); int r=0; - for (dlong n=0;noffd.nnz;n++) { + for (dlong n=0;n=A->globalColStarts[r+1]) r++; + while(row>=A.globalColStarts[r+1]) r++; sendCounts[r]++; } - MPI_Alltoall(sendCounts, 1, MPI_INT, - recvCounts, 1, MPI_INT, A->comm); + A.comm.Alltoall(sendCounts, recvCounts); + sendOffsets[0] = 0; + recvOffsets[0] = 0; for (r=0;rplatform, A->comm); + parCOO cooAt(A.platform, A.comm); //copy global partition - cooAt.globalRowStarts = (hlong *) calloc(size+1,sizeof(hlong)); - cooAt.globalColStarts = (hlong *) calloc(size+1,sizeof(hlong)); - memcpy(cooAt.globalRowStarts, A->globalColStarts, (size+1)*sizeof(hlong)); - memcpy(cooAt.globalColStarts, A->globalRowStarts, (size+1)*sizeof(hlong)); + cooAt.globalRowStarts = A.globalColStarts; + cooAt.globalColStarts = A.globalRowStarts; - cooAt.nnz = A->diag.nnz+offdnnz; - cooAt.entries = (parCOO::nonZero_t *) calloc(cooAt.nnz, sizeof(parCOO::nonZero_t)); + cooAt.nnz = A.diag.nnz+offdnnz; + cooAt.entries.malloc(cooAt.nnz); //fill local nonzeros - for(dlong i=0; iNrows; i++){ - const dlong Jstart = A->diag.rowStarts[i]; - const dlong Jend = A->diag.rowStarts[i+1]; + for(dlong i=0; idiag.cols[jj] + A->globalColStarts[rank]; - cooAt.entries[jj].col = i + A->globalRowStarts[rank]; - cooAt.entries[jj].val = A->diag.vals[jj]; + cooAt.entries[jj].row = A.diag.cols[jj] + A.globalColStarts[rank]; + cooAt.entries[jj].col = i + A.globalRowStarts[rank]; + cooAt.entries[jj].val = A.diag.vals[jj]; } } // receive non-local nonzeros - 
MPI_Alltoallv(sendNonZeros, sendCounts, sendOffsets, MPI_NONZERO_T, - cooAt.entries+A->diag.nnz, recvCounts, recvOffsets, MPI_NONZERO_T, - A->comm); - - //clean up - MPI_Barrier(A->comm); - free(sendNonZeros); - free(sendCounts); - free(recvCounts); - free(sendOffsets); - free(recvOffsets); + A.comm.Alltoallv(sendNonZeros, sendCounts, sendOffsets, + cooAt.entries+A.diag.nnz, recvCounts, recvOffsets); //sort by row - std::sort(cooAt.entries, cooAt.entries+cooAt.nnz, + std::sort(cooAt.entries.ptr(), cooAt.entries.ptr()+cooAt.nnz, [](const parCOO::nonZero_t& a, const parCOO::nonZero_t& b) { if (a.row < b.row) return true; if (a.row > b.row) return false; @@ -126,7 +116,9 @@ parCSR *transpose(parCSR *A){ return a.col < b.col; }); - return new parCSR(cooAt); + return parCSR(cooAt); } -} //namespace parAlmond \ No newline at end of file +} //namespace parAlmond + +} //namespace libp diff --git a/libs/parAlmond/parAlmondVcycle.cpp b/libs/parAlmond/parAlmondVcycle.cpp index 8c232d945..d7227ec41 100644 --- a/libs/parAlmond/parAlmondVcycle.cpp +++ b/libs/parAlmond/parAlmondVcycle.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,11 +25,13 @@ SOFTWARE. 
*/ #include "parAlmond.hpp" -#include "parAlmond/parAlmondMultigrid.hpp" +#include "parAlmond/parAlmondCoarseSolver.hpp" + +namespace libp { namespace parAlmond { -void multigrid_t::vcycle(const int k, occa::memory& o_RHS, occa::memory& o_X){ +void multigrid_t::vcycle(const int k, deviceMemory& o_RHS, deviceMemory& o_X){ //check for base level if(k==baseLevel) { @@ -37,24 +39,26 @@ void multigrid_t::vcycle(const int k, occa::memory& o_RHS, occa::memory& o_X){ return; } - multigridLevel *level = levels[k]; - occa::memory& o_RHSC = o_rhs[k+1]; - occa::memory& o_XC = o_x[k+1]; - occa::memory& o_RES = o_scratch; + multigridLevel& level = *levels[k]; + deviceMemory& o_RHSC = o_rhs[k+1]; + deviceMemory& o_XC = o_x[k+1]; + deviceMemory& o_RES = o_scratch; //apply smoother to x and then compute res = rhs-Ax - level->smooth(o_RHS, o_X, true); - level->residual(o_RHS, o_X, o_RES); + level.smooth(o_RHS, o_X, true); + level.residual(o_RHS, o_X, o_RES); // rhsC = P^T res - level->coarsen(o_RES, o_RHSC); + level.coarsen(o_RES, o_RHSC); vcycle(k+1, o_RHSC, o_XC); // x = x + P xC - level->prolongate(o_XC, o_X); + level.prolongate(o_XC, o_X); - level->smooth(o_RHS, o_X, false); + level.smooth(o_RHS, o_X, false); } } //namespace parAlmond + +} //namespace libp diff --git a/libs/parAlmond/parAlmondparCSR.cpp b/libs/parAlmond/parAlmondparCSR.cpp index 9339ba003..e579fbbf1 100644 --- a/libs/parAlmond/parAlmondparCSR.cpp +++ b/libs/parAlmond/parAlmondparCSR.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -28,6 +28,8 @@ SOFTWARE. 
#include "parAlmond/parAlmondparCSR.hpp" #include "parAlmond/parAlmondKernels.hpp" +namespace libp { + namespace parAlmond { //------------------------------------------------------------------------ @@ -36,8 +38,10 @@ namespace parAlmond { // //------------------------------------------------------------------------ -void parCSR::SpMV(const dfloat alpha, dfloat *x, - const dfloat beta, dfloat *y) { +void parCSR::SpMV(const dfloat alpha, memory& x, + const dfloat beta, memory& y) { + + halo.ExchangeStart(x, 1); // z[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j]) // #pragma omp parallel for @@ -52,7 +56,7 @@ void parCSR::SpMV(const dfloat alpha, dfloat *x, y[i] = alpha*result; } - halo->Exchange(x, 1, ogs_dfloat); + halo.ExchangeFinish(x, 1); // #pragma omp parallel for for(dlong i=0; i& x, + const dfloat beta, const memory& y, memory& z) { + + halo.ExchangeStart(x, 1); // z[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j]) // #pragma omp parallel for @@ -78,7 +84,7 @@ void parCSR::SpMV(const dfloat alpha, dfloat *x, z[i] = alpha*result + beta*y[i]; } - halo->Exchange(x, 1, ogs_dfloat); + halo.ExchangeFinish(x, 1); for(dlong i=0; i& o_x, const dfloat beta, + deviceMemory& o_y) { - halo->ExchangeStart(o_x, 1, ogs_dfloat); + halo.ExchangeStart(o_x, 1); // z[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j]) if (diag.NrowBlocks) @@ -102,7 +108,7 @@ void parCSR::SpMV(const dfloat alpha, occa::memory& o_x, const dfloat beta, diag.o_cols, diag.o_vals, o_x, o_y); - halo->ExchangeFinish(o_x, 1, ogs_dfloat); + halo.ExchangeFinish(o_x, 1); const dfloat one = 1.0; if (offd.NrowBlocks) @@ -112,10 +118,10 @@ void parCSR::SpMV(const dfloat alpha, occa::memory& o_x, const dfloat beta, o_x, o_y); } -void parCSR::SpMV(const dfloat alpha, occa::memory& o_x, const dfloat beta, - occa::memory& o_y, occa::memory& o_z) { +void parCSR::SpMV(const dfloat alpha, deviceMemory& o_x, const dfloat beta, + deviceMemory& o_y, deviceMemory& o_z) { - halo->ExchangeStart(o_x, 1, ogs_dfloat); + 
halo.ExchangeStart(o_x, 1); // z[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j]) if (diag.NrowBlocks) @@ -124,7 +130,7 @@ void parCSR::SpMV(const dfloat alpha, occa::memory& o_x, const dfloat beta, diag.o_cols, diag.o_vals, o_x, o_y, o_z); - halo->ExchangeFinish(o_x, 1, ogs_dfloat); + halo.ExchangeFinish(o_x, 1); const dfloat one = 1.0; if (offd.NrowBlocks) @@ -146,25 +152,21 @@ parCSR::parCSR(parCOO& A): // number of nonzeros on this rank platform(A.platform), comm(A.comm) { - int rank; - int size; - MPI_Comm_rank(comm, &rank); - MPI_Comm_size(comm, &size); + int rank = comm.rank(); + // int size = comm.size(); //copy global partition - globalRowStarts = (hlong *) calloc(size+1,sizeof(hlong)); - globalColStarts = (hlong *) calloc(size+1,sizeof(hlong)); - memcpy(globalRowStarts, A.globalRowStarts, (size+1)*sizeof(hlong)); - memcpy(globalColStarts, A.globalColStarts, (size+1)*sizeof(hlong)); + globalRowStarts = A.globalRowStarts; + globalColStarts = A.globalColStarts; const hlong globalRowOffset = globalRowStarts[rank]; const hlong globalColOffset = globalColStarts[rank]; - Nrows = (dlong)(globalRowStarts[rank+1]-globalRowStarts[rank]); - Ncols = (dlong)(globalColStarts[rank+1]-globalColStarts[rank]); + Nrows = static_cast(globalRowStarts[rank+1]-globalRowStarts[rank]); + Ncols = static_cast(globalColStarts[rank+1]-globalColStarts[rank]); - diag.rowStarts = (dlong *) calloc(Nrows+1, sizeof(dlong)); - offd.rowStarts = (dlong *) calloc(Nrows+1, sizeof(dlong)); + diag.rowStarts.malloc(Nrows+1, 0); + offd.rowStarts.malloc(Nrows+1, 0); //count the entries in each row for (dlong n=0;n0) offd.nzRows++; - offd.rows = (dlong *) calloc(offd.nzRows, sizeof(dlong)); - offd.mRowStarts = (dlong *) calloc(offd.nzRows+1, sizeof(dlong)); + offd.rows.malloc(offd.nzRows); + offd.mRowStarts.malloc(offd.nzRows+1); // cumulative sum dlong cnt=0; + offd.mRowStarts[0] = 0; for(dlong i=0; i0) { offd.rows[cnt] = i; //record row id @@ -201,7 +204,7 @@ parCSR::parCSR(parCOO& A): // number of 
nonzeros on this rank // Halo setup cnt=0; - hlong *colIds = (hlong *) malloc(offd.nnz*sizeof(hlong)); + memory colIds(offd.nnz); for (dlong n=0;n globalColOffset+Ncols-1)) @@ -210,10 +213,10 @@ parCSR::parCSR(parCOO& A): // number of nonzeros on this rank haloSetup(colIds); //setup halo, and transform colIds to a local indexing //fill the CSR matrices - diag.cols = (dlong *) calloc(diag.nnz, sizeof(dlong)); - offd.cols = (dlong *) calloc(offd.nnz, sizeof(dlong)); - diag.vals = (pfloat *) calloc(diag.nnz, sizeof(pfloat)); - offd.vals = (pfloat *) calloc(offd.nnz, sizeof(pfloat)); + diag.cols.malloc(diag.nnz); + offd.cols.malloc(offd.nnz); + diag.vals.malloc(diag.nnz); + offd.vals.malloc(offd.nnz); dlong diagCnt = 0; dlong offdCnt = 0; for (dlong n=0;n(A.entries[n].col - globalColOffset); diag.vals[diagCnt] = A.entries[n].val; diagCnt++; } } - free(colIds); } //------------------------------------------------------------------------ @@ -247,15 +249,14 @@ typedef struct { } parallelId_t; -void parCSR::haloSetup(hlong *colIds) { +void parCSR::haloSetup(memory colIds) { - int rank; - MPI_Comm_rank(comm, &rank); + int rank = comm.rank(); const hlong globalOffset = globalColStarts[rank]; //collect the unique nonlocal column ids - parallelId_t* parIds = (parallelId_t*) malloc(offd.nnz*sizeof(parallelId_t)); + memory parIds(offd.nnz); for (dlong n=0;n b.globalId) return false; @@ -283,7 +284,7 @@ void parCSR::haloSetup(hlong *colIds) { if(offd.nnz) Noffdcols++; //record the global ids of the unique columns - hlong *offdcols = (hlong *) malloc(Noffdcols*sizeof(hlong)); + memory offdcols(Noffdcols); Noffdcols = 0; if(offd.nnz) offdcols[Noffdcols++] = parIds[0].globalId; for (dlong n=1;n b.localId) return false; @@ -304,22 +305,20 @@ void parCSR::haloSetup(hlong *colIds) { Ncols += Noffdcols; //make an array of all the column ids required on this rank (local first) - colMap = (hlong*) malloc(Ncols*sizeof(hlong)); + colMap.malloc(Ncols); for (dlong n=0; nExchange(diagA, 1, 
ogs_dfloat); + halo.Exchange(diagA, 1); //compute the inverse diagonal for (dlong n=0;nFree(); -} //------------------------------------------------------------------------ // @@ -389,44 +363,44 @@ parCSR::~parCSR() { dfloat parCSR::rhoDinvA(){ - int size; - MPI_Comm_size(comm, &size); + int size = comm.size(); int k = 10; hlong Ntotal = globalRowStarts[size]; - if(k > Ntotal) k = (int) Ntotal; + if(k > Ntotal) k = static_cast(Ntotal); // do an arnoldi // allocate memory for Hessenberg matrix - double *H = (double *) calloc(k*k,sizeof(double)); + memory H(k*k, 0.0); // allocate memory for basis - dfloat **V = (dfloat **) calloc(k+1, sizeof(dfloat *)); - dfloat *Vx = (dfloat *) calloc(Ncols, sizeof(dfloat)); + memory> V(k+1); + memory Vx(Ncols); - for(int i=0; i<=k; i++) - V[i] = (dfloat *) calloc(Nrows, sizeof(dfloat)); + for(int i=0; i<=k; i++) { + V[i].malloc(Nrows); + } // generate a random vector for initial basis vector for(dlong n=0; n(hij); } if(j+1 < k){ // dfloat norm_vj = vectorNorm(Nrows,V[j+1],comm); - dfloat norm_vj=0.0, gnorm_vj=0.0; + dfloat norm_vj=0.0; for(dlong n=0; n(norm_vj); // vectorScale(Nrows, 1./H[j+1 + j*k], V[j+1]); for(dlong n=0; n WR(k); + memory WI(k); - matrixEigenValues(k, H, WR, WI); + linAlg_t::matrixEigenValues(k, H, WR, WI); double RHO = 0.; @@ -478,15 +452,6 @@ dfloat parCSR::rhoDinvA(){ } } - free(H); - free(WR); - free(WI); - - // free memory - for(int i=0; i<=k; i++) free(V[i]); - free(Vx); - free(V); - // printf("weight = %g \n", RHO); return RHO; @@ -496,7 +461,7 @@ void parCSR::syncToDevice() { if (Nrows) { //transfer matrix data - diag.o_rowStarts = platform.malloc((Nrows+1)*sizeof(dlong), diag.rowStarts); + diag.o_rowStarts = platform.malloc(diag.rowStarts); diag.NrowBlocks=0; if (diag.nnz) { @@ -506,12 +471,9 @@ void parCSR::syncToDevice() { for (dlong i=0;i parAlmond::NonzerosPerBlock) { - //this row is pathalogically big. 
We can't currently run this - stringstream ss; - ss << "Multiplicity of row: " << i << " is " << rowSize << " in parAlmond::parCSR setup and is too large."; - LIBP_ABORT(ss.str()) - } + //this may be pathalogically big. We can't currently run this + LIBP_ABORT("Multiplicity of row: " << i << " is " << rowSize << " in parAlmond::parCSR setup and is too large.", + rowSize > parAlmond::NonzerosPerBlock); if (blockSum+rowSize > parAlmond::NonzerosPerBlock) { //adding this row will exceed the nnz per block diag.NrowBlocks++; //count the previous block @@ -521,7 +483,7 @@ void parCSR::syncToDevice() { } } - diag.blockRowStarts = (dlong*) calloc(diag.NrowBlocks+1,sizeof(dlong)); + diag.blockRowStarts.malloc(diag.NrowBlocks+1, 0); blockSum=0; diag.NrowBlocks=1; @@ -536,11 +498,11 @@ void parCSR::syncToDevice() { } } diag.blockRowStarts[diag.NrowBlocks] = Nrows; - diag.o_blockRowStarts = platform.malloc((diag.NrowBlocks+1)*sizeof(dlong), diag.blockRowStarts); + diag.o_blockRowStarts = platform.malloc(diag.blockRowStarts); //transfer matrix data - diag.o_cols = platform.malloc(diag.nnz*sizeof(dlong), diag.cols); - diag.o_vals = platform.malloc(diag.nnz*sizeof(pfloat), diag.vals); + diag.o_cols = platform.malloc(diag.cols); + diag.o_vals = platform.malloc(diag.vals); } if (offd.nnz) { @@ -550,12 +512,9 @@ void parCSR::syncToDevice() { for (dlong i=0;i parAlmond::NonzerosPerBlock) { - //this row is pathalogically big. We can't currently run this - stringstream ss; - ss << "Multiplicity of row: " << i << " is " << rowSize << " in parAlmond::parCSR setup and is too large."; - LIBP_ABORT(ss.str()) - } + //this row may be pathalogically big. 
We can't currently run this + LIBP_ABORT("Multiplicity of row: " << i << " is " << rowSize << " in parAlmond::parCSR setup and is too large.", + rowSize > parAlmond::NonzerosPerBlock); if (blockSum+rowSize > parAlmond::NonzerosPerBlock) { //adding this row will exceed the nnz per block offd.NrowBlocks++; //count the previous block @@ -565,7 +524,7 @@ void parCSR::syncToDevice() { } } - offd.blockRowStarts = (dlong*) calloc(offd.NrowBlocks+1,sizeof(dlong)); + offd.blockRowStarts.malloc(offd.NrowBlocks+1, 0); blockSum=0; offd.NrowBlocks=1; @@ -580,21 +539,23 @@ void parCSR::syncToDevice() { } } offd.blockRowStarts[offd.NrowBlocks] = offd.nzRows; - offd.o_blockRowStarts = platform.malloc((offd.NrowBlocks+1)*sizeof(dlong), offd.blockRowStarts); + offd.o_blockRowStarts = platform.malloc(offd.blockRowStarts); //transfer matrix data - offd.o_rows = platform.malloc(offd.nzRows*sizeof(dlong), offd.rows); - offd.o_mRowStarts = platform.malloc((offd.nzRows+1)*sizeof(dlong), offd.mRowStarts); + offd.o_rows = platform.malloc(offd.rows); + offd.o_mRowStarts = platform.malloc(offd.mRowStarts); - offd.o_cols = platform.malloc(offd.nnz*sizeof(dlong), offd.cols); - offd.o_vals = platform.malloc(offd.nnz*sizeof(pfloat), offd.vals); + offd.o_cols = platform.malloc(offd.cols); + offd.o_vals = platform.malloc(offd.vals); } - if (diagA) { - o_diagA = platform.malloc(Nrows*sizeof(dfloat), diagA); - o_diagInv = platform.malloc(Nrows*sizeof(dfloat), diagInv); + if (diagA.size()) { + o_diagA = platform.malloc(diagA); + o_diagInv = platform.malloc(diagInv); } } } } //namespace parAlmond + +} //namespace libp diff --git a/libs/timeStepper/okl/timeStepperAB.okl b/libs/timeStepper/okl/timeStepperAB.okl index 301ad129a..62595718c 100644 --- a/libs/timeStepper/okl/timeStepperAB.okl +++ b/libs/timeStepper/okl/timeStepperAB.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse 
Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/libs/timeStepper/okl/timeStepperDOPRI5.okl b/libs/timeStepper/okl/timeStepperDOPRI5.okl index 0965545f4..f4916e8fa 100644 --- a/libs/timeStepper/okl/timeStepperDOPRI5.okl +++ b/libs/timeStepper/okl/timeStepperDOPRI5.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -132,28 +132,19 @@ SOFTWARE. } } - @barrier("local"); #if p_blockSize>512 for(int t=0;t256 for(int t=0;t512 for(int t=0;t256 for(int t=0;t& o_q, + dfloat start, dfloat end) { + assertInitialized(); + ts->Run(solver, o_q, start, end); +} + +void timeStepper_t::SetTimeStep(dfloat dt_) { + assertInitialized(); + ts->SetTimeStep(dt_); +} + +dfloat timeStepper_t::GetTimeStep() { + assertInitialized(); + return ts->dt; +} + +dfloat timeStepper_t::GetGamma() { + assertInitialized(); + return ts->GetGamma(); +} + +void timeStepper_t::assertInitialized() { + LIBP_ABORT("timeStepper_t not initialized", + ts==nullptr); +} + +} //namespace libp diff --git a/libs/timeStepper/timeStepperAB3.cpp b/libs/timeStepper/timeStepperAB3.cpp index 3c60e01eb..fc2249e31 100644 --- a/libs/timeStepper/timeStepperAB3.cpp +++ b/libs/timeStepper/timeStepperAB3.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,23 +27,28 @@ SOFTWARE. 
#include "core.hpp" #include "timeStepper.hpp" +namespace libp { + namespace TimeStepper { /* Adams Bashforth, order 3 */ ab3::ab3(dlong Nelements, dlong NhaloElements, - int Np, int Nfields, solver_t& _solver): - timeStepper_t(Nelements, NhaloElements, Np, Nfields, _solver) { - - platform_t &platform = solver.platform; + int Np, int Nfields, + platform_t& _platform, comm_t _comm): + timeStepperBase_t(Nelements, NhaloElements, Np, Nfields, + _platform, _comm) { Nstages = 3; shiftIndex = 0; - o_rhsq = platform.malloc(Nstages*N*sizeof(dfloat)); + memory rhsq(Nstages*N,0.0); + o_rhsq = platform.malloc(rhsq); + + properties_t kernelInfo = platform.props(); //copy base occa properties from solver - occa::properties kernelInfo = platform.props; //copy base occa properties from solver + const int blocksize=256; - kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE; + kernelInfo["defines/" "p_blockSize"] = blocksize; kernelInfo["defines/" "p_Nstages"] = Nstages; updateKernel = platform.buildKernel(TIMESTEPPER_DIR "/okl/" @@ -57,19 +62,19 @@ ab3::ab3(dlong Nelements, dlong NhaloElements, 3./2., -1./2., 0.0, 23./12., -16./12., 5./12.}; - ab_a = (dfloat*) calloc(Nstages*Nstages, sizeof(dfloat)); - memcpy(ab_a, _ab_a, Nstages*Nstages*sizeof(dfloat)); + ab_a.malloc(Nstages*Nstages); + ab_a.copyFrom(_ab_a); - o_ab_a = platform.malloc(Nstages*Nstages*sizeof(dfloat), ab_a); + o_ab_a = platform.malloc(ab_a); } -void ab3::Run(occa::memory &o_q, dfloat start, dfloat end) { +void ab3::Run(solver_t& solver, deviceMemory &o_q, dfloat start, dfloat end) { dfloat time = start; solver.Report(time,0); - dfloat outputInterval; + dfloat outputInterval=0.0; solver.settings.getSetting("OUTPUT INTERVAL", outputInterval); dfloat outputTime = time + outputInterval; @@ -77,7 +82,7 @@ void ab3::Run(occa::memory &o_q, dfloat start, dfloat end) { int tstep=0; int order=0; while (time < end) { - Step(o_q, time, dt, order); + Step(solver, o_q, time, dt, order); time += dt; tstep++; if (order &o_q, dfloat 
time, dfloat _dt, int order) { //rhs at current index - occa::memory o_rhsq0 = o_rhsq + shiftIndex*N*sizeof(dfloat); + deviceMemory o_rhsq0 = o_rhsq + shiftIndex*N; //A coefficients at current order - occa::memory o_A = o_ab_a + order*Nstages*sizeof(dfloat); + deviceMemory o_A = o_ab_a + order*Nstages; //evaluate ODE rhs = f(q,t) solver.rhsf(o_q, o_rhsq0, time); @@ -113,45 +118,33 @@ void ab3::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) { shiftIndex = (shiftIndex+Nstages-1)%Nstages; } -ab3::~ab3() { - if (o_rhsq.size()) o_rhsq.free(); - if (o_ab_a.size()) o_ab_a.free(); - - if (ab_a) free(ab_a); - - updateKernel.free(); -} - /**************************************************/ /* PML version */ /**************************************************/ /* Adams Bashforth, order 3 */ ab3_pml::ab3_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements, - int Np, int Nfields, int Npmlfields, solver_t& _solver): - ab3(Nelements, NhaloElements, Np, Nfields, _solver), + int Np, int Nfields, int Npmlfields, + platform_t& _platform, comm_t _comm): + ab3(Nelements, NhaloElements, Np, Nfields, _platform, _comm), Npml(NpmlElements*Np*Npmlfields) { if (Npml) { - platform_t &platform = solver.platform; - - dfloat *pmlq = (dfloat *) calloc(Npml,sizeof(dfloat)); - o_pmlq = platform.malloc(Npml*sizeof(dfloat), pmlq); - free(pmlq); - - o_rhspmlq = platform.malloc(Nstages*Npml*sizeof(dfloat)); + memory pmlq(Npml,0.0); + o_pmlq = platform.malloc(pmlq); + o_rhspmlq = platform.malloc(Nstages*Npml); } } -void ab3_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) { +void ab3_pml::Step(solver_t& solver, deviceMemory &o_q, dfloat time, dfloat _dt, int order) { //rhs at current index - occa::memory o_rhsq0 = o_rhsq + shiftIndex*N*sizeof(dfloat); - occa::memory o_rhspmlq0; - if (Npml) o_rhspmlq0 = o_rhspmlq + shiftIndex*Npml*sizeof(dfloat); + deviceMemory o_rhsq0 = o_rhsq + shiftIndex*N; + deviceMemory o_rhspmlq0; + if (Npml) o_rhspmlq0 = o_rhspmlq + 
shiftIndex*Npml; //A coefficients at current order - occa::memory o_A = o_ab_a + order*Nstages*sizeof(dfloat); + deviceMemory o_A = o_ab_a + order*Nstages; //evaluate ODE rhs = f(q,t) solver.rhsf_pml(o_q, o_pmlq, o_rhsq0, o_rhspmlq0, time); @@ -175,9 +168,6 @@ void ab3_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) { shiftIndex = (shiftIndex+Nstages-1)%Nstages; } -ab3_pml::~ab3_pml() { - if (o_pmlq.size()) o_pmlq.free(); - if (o_rhspmlq.size()) o_rhspmlq.free(); -} - } //namespace TimeStepper + +} //namespace libp diff --git a/libs/timeStepper/timeStepperDOPRI5.cpp b/libs/timeStepper/timeStepperDOPRI5.cpp index b956aab57..ac0c0237c 100644 --- a/libs/timeStepper/timeStepperDOPRI5.cpp +++ b/libs/timeStepper/timeStepperDOPRI5.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,36 +27,38 @@ SOFTWARE. 
#include "core.hpp" #include "timeStepper.hpp" +namespace libp { + namespace TimeStepper { dopri5::dopri5(dlong Nelements, dlong NhaloElements, - int Np, int Nfields, solver_t& _solver, MPI_Comm _comm): - timeStepper_t(Nelements, NhaloElements, Np, Nfields, _solver), comm(_comm) { - - platform_t &platform = solver.platform; + int Np, int Nfields, + platform_t& _platform, comm_t _comm): + timeStepperBase_t(Nelements, NhaloElements, + Np, Nfields, _platform, _comm) { Nrk = 7; - o_rhsq = platform.malloc(N*sizeof(dfloat)); - o_rkq = platform.malloc((N+Nhalo)*sizeof(dfloat)); - o_rkrhsq = platform.malloc(Nrk*N*sizeof(dfloat)); - o_rkerr = platform.malloc(N*sizeof(dfloat)); + o_rhsq = platform.malloc(N); + o_rkq = platform.malloc(N+Nhalo); + o_rkrhsq = platform.malloc(Nrk*N); + o_rkerr = platform.malloc(N); + + o_saveq = platform.malloc(N); - o_saveq = platform.malloc(N*sizeof(dfloat)); + const int blocksize = 256; - Nblock = (N+BLOCKSIZE-1)/BLOCKSIZE; - errtmp = (dfloat*) platform.hostMalloc(Nblock*sizeof(dfloat), - NULL, h_errtmp); - o_errtmp = platform.malloc(Nblock*sizeof(dfloat)); + Nblock = (N+blocksize-1)/blocksize; + h_errtmp = platform.hostMalloc(Nblock); + o_errtmp = platform.malloc(Nblock); - hlong Nlocal = N; - hlong Ntotal; - MPI_Allreduce(&Nlocal, &Ntotal, 1, MPI_HLONG, MPI_SUM, comm); + hlong Ntotal = N; + comm.Allreduce(Ntotal); - occa::properties kernelInfo = platform.props; //copy base occa properties from solver + properties_t kernelInfo = platform.props(); //copy base occa properties from solver //add defines - kernelInfo["defines/" "p_blockSize"] = (int)BLOCKSIZE; + kernelInfo["defines/" "p_blockSize"] = (int)blocksize; rkUpdateKernel = platform.buildKernel(TIMESTEPPER_DIR "/okl/" "timeStepperDOPRI5.okl", @@ -84,16 +86,16 @@ dopri5::dopri5(dlong Nelements, dlong NhaloElements, 35.0/384.0, 0.0, 500.0/1113.0, 125.0/192.0, -2187.0/6784.0, 11.0/84.0, 0.0 }; dfloat _rkE[7] = {71.0/57600.0, 0.0, -71.0/16695.0, 71.0/1920.0, -17253.0/339200.0, 22.0/525.0, 
-1.0/40.0 }; - rkC = (dfloat*) calloc(Nrk, sizeof(dfloat)); - rkE = (dfloat*) calloc(Nrk, sizeof(dfloat)); - rkA = (dfloat*) calloc(Nrk*Nrk, sizeof(dfloat)); + rkC.malloc(Nrk); + rkE.malloc(Nrk); + rkA.malloc(Nrk*Nrk); - memcpy(rkC, _rkC, Nrk*sizeof(dfloat)); - memcpy(rkE, _rkE, Nrk*sizeof(dfloat)); - memcpy(rkA, _rkA, Nrk*Nrk*sizeof(dfloat)); + rkC.copyFrom(_rkC); + rkE.copyFrom(_rkE); + rkA.copyFrom(_rkA); - o_rkA = platform.malloc(Nrk*Nrk*sizeof(dfloat), rkA); - o_rkE = platform.malloc(Nrk*sizeof(dfloat), rkE); + o_rkA = platform.malloc(rkA); + o_rkE = platform.malloc(rkE); dtMIN = 1E-9; //minumum allowed timestep ATOL = 1E-6; //absolute error tolerance @@ -112,16 +114,16 @@ dopri5::dopri5(dlong Nelements, dlong NhaloElements, sqrtinvNtotal = 1.0/sqrt(Ntotal); } -void dopri5::Run(occa::memory &o_q, dfloat start, dfloat end) { +void dopri5::Run(solver_t& solver, deviceMemory &o_q, dfloat start, dfloat end) { dfloat time = start; // int rank; - // MPI_Comm_rank(comm, &rank); + // comm_rank_t(comm, &rank); solver.Report(time,0); - dfloat outputInterval; + dfloat outputInterval=0.0; solver.settings.getSetting("OUTPUT INTERVAL", outputInterval); dfloat outputTime = time + outputInterval; @@ -130,23 +132,17 @@ void dopri5::Run(occa::memory &o_q, dfloat start, dfloat end) { while (time < end) { - if (dtoutputTime) outputTime+= outputInterval; //catch up next output in case dt>outputInterval - facold = mymax(err,1E-4); // hard coded factor ? + constexpr dfloat errMax = 1.0e-4; // hard coded factor ? 
+ facold = std::max(err,errMax); // if (!rank) // printf("\r time = %g (%d), dt = %g accepted ", time, allStep, dt); tstep++; } else { - dtnew = dt/(mymax(invfactor1,fac1/safe)); + dtnew = dt/(std::max(invfactor1,fac1/safe)); // if (!rank) // printf("\r time = %g (%d), dt = %g rejected, trying %g", time, allStep, dt, dtnew); @@ -222,19 +219,19 @@ void dopri5::Run(occa::memory &o_q, dfloat start, dfloat end) { // printf("%d accepted steps and %d total steps\n", tstep, allStep); } -void dopri5::Backup(occa::memory &o_Q) { - o_saveq.copyFrom(o_Q, N*sizeof(dfloat)); +void dopri5::Backup(deviceMemory &o_Q) { + o_saveq.copyFrom(o_Q, N); } -void dopri5::Restore(occa::memory &o_Q) { - o_saveq.copyTo(o_Q, N*sizeof(dfloat)); +void dopri5::Restore(deviceMemory &o_Q) { + o_saveq.copyTo(o_Q, N); } -void dopri5::AcceptStep(occa::memory &o_q, occa::memory &o_rq) { - o_q.copyFrom(o_rq, N*sizeof(dfloat)); +void dopri5::AcceptStep(deviceMemory &o_q, deviceMemory &o_rq) { + o_q.copyFrom(o_rq, N); } -void dopri5::Step(occa::memory &o_q, dfloat time, dfloat _dt) { +void dopri5::Step(solver_t& solver, deviceMemory &o_q, dfloat time, dfloat _dt) { //RK step for(int rk=0;rk& o_q){ //Error estimation //E. HAIRER, S.P. NORSETT AND G. 
WANNER, SOLVING ORDINARY @@ -286,63 +283,44 @@ dfloat dopri5::Estimater(occa::memory& o_q){ o_rkerr, o_errtmp); - o_errtmp.copyTo(errtmp); - dfloat localerr = 0; + h_errtmp.copyFrom(o_errtmp); dfloat err = 0; for(dlong n=0;n pmlq(Npml,0.0); + o_pmlq = platform.malloc(pmlq); - dfloat *pmlq = (dfloat *) calloc(Npml,sizeof(dfloat)); - o_pmlq = platform.malloc(Npml*sizeof(dfloat), pmlq); - free(pmlq); + o_rhspmlq = platform.malloc(Npml); + o_rkpmlq = platform.malloc(Npml); + o_rkrhspmlq = platform.malloc(Nrk*Npml); - o_rhspmlq = platform.malloc(Npml*sizeof(dfloat)); - o_rkpmlq = platform.malloc(Npml*sizeof(dfloat)); - o_rkrhspmlq = platform.malloc(Nrk*Npml*sizeof(dfloat)); + o_savepmlq = platform.malloc(Npml); - o_savepmlq = platform.malloc(Npml*sizeof(dfloat)); + properties_t kernelInfo = platform.props(); //copy base occa properties from solver - occa::properties kernelInfo = platform.props; //copy base occa properties from solver + const int blocksize = 256; //add defines - kernelInfo["defines/" "p_blockSize"] = (int)BLOCKSIZE; + kernelInfo["defines/" "p_blockSize"] = (int)blocksize; rkPmlUpdateKernel = platform.buildKernel(TIMESTEPPER_DIR "/okl/" "timeStepperDOPRI5.okl", @@ -351,25 +329,25 @@ dopri5_pml::dopri5_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements, } } -void dopri5_pml::Backup(occa::memory &o_Q) { - o_saveq.copyFrom(o_Q, N*sizeof(dfloat)); +void dopri5_pml::Backup(deviceMemory &o_Q) { + o_saveq.copyFrom(o_Q, N); if (Npml) - o_savepmlq.copyFrom(o_rkpmlq, Npml*sizeof(dfloat)); + o_savepmlq.copyFrom(o_rkpmlq, Npml); } -void dopri5_pml::Restore(occa::memory &o_Q) { - o_saveq.copyTo(o_Q, N*sizeof(dfloat)); +void dopri5_pml::Restore(deviceMemory &o_Q) { + o_saveq.copyTo(o_Q, N); if (Npml) - o_savepmlq.copyTo(o_rkpmlq, Npml*sizeof(dfloat)); + o_savepmlq.copyTo(o_rkpmlq, Npml); } -void dopri5_pml::AcceptStep(occa::memory &o_q, occa::memory &o_rq) { - o_q.copyFrom(o_rq, N*sizeof(dfloat)); +void dopri5_pml::AcceptStep(deviceMemory &o_q, deviceMemory 
&o_rq) { + o_q.copyFrom(o_rq, N); if (Npml) - o_pmlq.copyFrom(o_rkpmlq, Npml*sizeof(dfloat)); + o_pmlq.copyFrom(o_rkpmlq, Npml); } -void dopri5_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt) { +void dopri5_pml::Step(solver_t& solver, deviceMemory &o_q, dfloat time, dfloat _dt) { //RK step for(int rk=0;rk qn(Nstages*N, 0.0); + o_qn = platform.malloc(qn); //q history - dfloat *rhs = (dfloat *) calloc(N,sizeof(dfloat)); - o_rhs = platform.malloc(N*sizeof(dfloat), rhs); //rhs storage - free(rhs); + memory rhs(N,0.0); + o_rhs = platform.malloc(rhs); //rhs storage - o_F = platform.malloc(Nstages*N*sizeof(dfloat), qn); //F(q) history (explicit part) - free(qn); + o_F = platform.malloc(qn); //F(q) history (explicit part) - occa::properties kernelInfo = platform.props; //copy base occa properties from solver + properties_t kernelInfo = platform.props(); //copy base occa properties from solver - kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE; + const int blocksize=256; + + kernelInfo["defines/" "p_blockSize"] = blocksize; kernelInfo["defines/" "p_Nstages"] = Nstages; rhsKernel = platform.buildKernel(TIMESTEPPER_DIR "/okl/" @@ -69,26 +71,26 @@ extbdf3::extbdf3(dlong Nelements, dlong NhaloElements, 3./2., 2., -1./2., 0., 11./6., 3., -3./2., 1./3.}; - extbdf_a = (dfloat*) calloc(Nstages*Nstages, sizeof(dfloat)); - extbdf_b = (dfloat*) calloc(Nstages*(Nstages+1), sizeof(dfloat)); - memcpy(extbdf_a, _a, Nstages*Nstages*sizeof(dfloat)); - memcpy(extbdf_b, _b, Nstages*(Nstages+1)*sizeof(dfloat)); + extbdf_a.malloc(Nstages*Nstages); + extbdf_b.malloc(Nstages*(Nstages+1)); + extbdf_a.copyFrom(_a); + extbdf_b.copyFrom(_b); - o_extbdf_a = platform.malloc(Nstages*Nstages*sizeof(dfloat), extbdf_a); - o_extbdf_b = platform.malloc(Nstages*(Nstages+1)*sizeof(dfloat), extbdf_b); + o_extbdf_a = platform.malloc(extbdf_a); + o_extbdf_b = platform.malloc(extbdf_b); } -dfloat extbdf3::getGamma() { - return *(extbdf_b + (Nstages-1)*(Nstages+1)); //first entry of last row of B +dfloat 
extbdf3::GetGamma() { + return extbdf_b[(Nstages-1)*(Nstages+1)]; //first entry of last row of B } -void extbdf3::Run(occa::memory &o_q, dfloat start, dfloat end) { +void extbdf3::Run(solver_t& solver, deviceMemory &o_q, dfloat start, dfloat end) { dfloat time = start; solver.Report(time,0); - dfloat outputInterval; + dfloat outputInterval=0.0; solver.settings.getSetting("OUTPUT INTERVAL", outputInterval); dfloat outputTime = time + outputInterval; @@ -96,7 +98,7 @@ void extbdf3::Run(occa::memory &o_q, dfloat start, dfloat end) { int tstep=0; int order=0; while (time < end) { - Step(o_q, time, dt, order); + Step(solver, o_q, time, dt, order); time += dt; tstep++; if (order &o_q, dfloat time, dfloat _dt, int order) { //F(q) at current index - occa::memory o_F0 = o_F + shiftIndex*N*sizeof(dfloat); + deviceMemory o_F0 = o_F + shiftIndex*N; //coefficients at current order - occa::memory o_A = o_extbdf_a + order*Nstages*sizeof(dfloat); - occa::memory o_B = o_extbdf_b + order*(Nstages+1)*sizeof(dfloat); - dfloat *B = extbdf_b + order*(Nstages+1); + deviceMemory o_A = o_extbdf_a + order*Nstages; + deviceMemory o_B = o_extbdf_b + order*(Nstages+1); + memory B = extbdf_b + order*(Nstages+1); //evaluate explicit part of rhs: F(q,t) solver.rhs_imex_f(o_q, o_F0, time); @@ -143,17 +145,6 @@ void extbdf3::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) { shiftIndex = (shiftIndex+Nstages-1)%Nstages; } -extbdf3::~extbdf3() { - if (o_rhs.size()) o_rhs.free(); - if (o_qn.size()) o_qn.free(); - if (o_F.size()) o_F.free(); - if (o_extbdf_a.size()) o_extbdf_a.free(); - if (o_extbdf_b.size()) o_extbdf_b.free(); - - if (extbdf_a) free(extbdf_a); - if (extbdf_b) free(extbdf_b); - - rhsKernel.free(); -} - } //namespace TimeStepper + +} //namespace libp diff --git a/libs/timeStepper/timeStepperLSERK4.cpp b/libs/timeStepper/timeStepperLSERK4.cpp index c7ebdea74..2ffa4e4eb 100644 --- a/libs/timeStepper/timeStepperLSERK4.cpp +++ b/libs/timeStepper/timeStepperLSERK4.cpp @@ -2,7 +2,7 
@@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,22 +27,28 @@ SOFTWARE. #include "core.hpp" #include "timeStepper.hpp" +namespace libp { + namespace TimeStepper { lserk4::lserk4(dlong Nelements, dlong NhaloElements, - int Np, int Nfields, solver_t& _solver): - timeStepper_t(Nelements, NhaloElements, Np, Nfields, _solver) { - - platform_t &platform = solver.platform; + int Np, int Nfields, + platform_t& _platform, comm_t _comm): + timeStepperBase_t(Nelements, NhaloElements, Np, Nfields, + _platform, _comm) { Nrk = 5; - o_resq = platform.malloc(N*sizeof(dfloat)); - o_rhsq = platform.malloc(N*sizeof(dfloat)); + o_resq = platform.malloc(N); + o_rhsq = platform.malloc(N); + + o_saveq = platform.malloc(N); - occa::properties kernelInfo = platform.props; //copy base occa properties from solver + properties_t kernelInfo = platform.props(); //copy base occa properties from solver - kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE; + const int blocksize=256; + + kernelInfo["defines/" "p_blockSize"] = blocksize; updateKernel = platform.buildKernel(TIMESTEPPER_DIR "/okl/" "timeStepperLSERK4.okl", @@ -68,23 +74,21 @@ lserk4::lserk4(dlong Nelements, dlong NhaloElements, 2802321613138.0/2924317926251.0 , 1.0}; - rka = (dfloat*) calloc(Nrk, sizeof(dfloat)); - rkb = (dfloat*) calloc(Nrk, sizeof(dfloat)); - rkc = (dfloat*) calloc(Nrk+1, sizeof(dfloat)); - memcpy(rka, _rka, Nrk*sizeof(dfloat)); - memcpy(rkb, _rkb, Nrk*sizeof(dfloat)); - memcpy(rkc, _rkc, (Nrk+1)*sizeof(dfloat)); + rka.malloc(Nrk); + rkb.malloc(Nrk); + rkc.malloc(Nrk+1); + rka.copyFrom(_rka); + rkb.copyFrom(_rkb); + rkc.copyFrom(_rkc); } -void lserk4::Run(occa::memory &o_q, dfloat start, dfloat end) { - - platform_t &platform = 
solver.platform; +void lserk4::Run(solver_t& solver, deviceMemory &o_q, dfloat start, dfloat end) { dfloat time = start; solver.Report(time,0); - dfloat outputInterval; + dfloat outputInterval=0.0; solver.settings.getSetting("OUTPUT INTERVAL", outputInterval); dfloat outputTime = time + outputInterval; @@ -96,20 +100,18 @@ void lserk4::Run(occa::memory &o_q, dfloat start, dfloat end) { if (time=outputTime) { //save current state - occa::memory o_saveq = platform.malloc(N*sizeof(dfloat)); - o_saveq.copyFrom(o_q, N*sizeof(dfloat)); + o_saveq.copyFrom(o_q, N); stepdt = outputTime-time; //take small time step - Step(o_q, time, stepdt); + Step(solver, o_q, time, stepdt); //report state solver.Report(outputTime,tstep); //restore previous state - o_q.copyFrom(o_saveq, N*sizeof(dfloat)); - o_saveq.free(); + o_q.copyFrom(o_saveq, N); outputTime += outputInterval; } @@ -121,13 +123,13 @@ void lserk4::Run(occa::memory &o_q, dfloat start, dfloat end) { stepdt = dt; } - Step(o_q, time, stepdt); + Step(solver, o_q, time, stepdt); time += stepdt; tstep++; } } -void lserk4::Step(occa::memory &o_q, dfloat time, dfloat _dt) { +void lserk4::Step(solver_t& solver, deviceMemory &o_q, dfloat time, dfloat _dt) { // Low storage explicit Runge Kutta (5 stages, 4th order) for(int rk=0;rk pmlq(Npml,0.0); + o_pmlq = platform.malloc(pmlq); - o_respmlq = platform.malloc(Npml*sizeof(dfloat)); - o_rhspmlq = platform.malloc(Npml*sizeof(dfloat)); + o_respmlq = platform.malloc(Npml); + o_rhspmlq = platform.malloc(Npml); } } -void lserk4_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt) { +void lserk4_pml::Step(solver_t& solver, deviceMemory &o_q, dfloat time, dfloat _dt) { // Low storage explicit Runge Kutta (5 stages, 4th order) for(int rk=0;rk rhsq0(N, 0.0); + o_rhsq0 = platform.malloc(rhsq0); + + memory rhsq((Nstages-1)*N, 0.0); + o_rhsq = platform.malloc(rhsq); - dfloat *rhsq = (dfloat*) calloc((Nstages-1)*N, sizeof(dfloat)); - o_rhsq = platform.malloc((Nstages-1)*N*sizeof(dfloat), rhsq); - 
free(rhsq); + o_fQM = platform.malloc((mesh.Nelements+mesh.totalHaloPairs)*mesh.Nfp + *mesh.Nfaces*Nfields); - o_fQM = platform.malloc((mesh.Nelements+mesh.totalHaloPairs)*mesh.Nfp - *mesh.Nfaces*Nfields*sizeof(dfloat)); + properties_t kernelInfo = platform.props(); //copy base occa properties from solver - occa::properties kernelInfo = platform.props; //copy base occa properties from solver + const int blocksize=256; - kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE; + kernelInfo["defines/" "p_blockSize"] = blocksize; kernelInfo["defines/" "p_Nstages"] = Nstages; kernelInfo["defines/" "p_Np"] = mesh.Np; kernelInfo["defines/" "p_Nfp"] = mesh.Nfp; kernelInfo["defines/" "p_Nfaces"] = mesh.Nfaces; kernelInfo["defines/" "p_Nfields"] = Nfields; - int maxNodes = mymax(mesh.Np, mesh.Nfp*mesh.Nfaces); + int maxNodes = std::max(mesh.Np, mesh.Nfp*mesh.Nfaces); kernelInfo["defines/" "p_maxNodes"] = maxNodes; updateKernel = platform.buildKernel(TIMESTEPPER_DIR "/okl/" @@ -81,37 +83,36 @@ mrab3::mrab3(dlong Nelements, dlong NhaloElements, 5./8., -1./8., 0.0, 17./24., -7./24., 2./24.}; - ab_a = (dfloat*) calloc(Nstages*Nstages, sizeof(dfloat)); - ab_b = (dfloat*) calloc(Nstages*Nstages, sizeof(dfloat)); - memcpy(ab_a, _ab_a, Nstages*Nstages*sizeof(dfloat)); - memcpy(ab_b, _ab_b, Nstages*Nstages*sizeof(dfloat)); + ab_a.malloc(Nstages*Nstages); + ab_b.malloc(Nstages*Nstages); + ab_a.copyFrom(_ab_a); + ab_b.copyFrom(_ab_b); - shiftIndex = (int*) platform.hostMalloc(Nlevels*sizeof(int), - NULL, h_shiftIndex); - o_shiftIndex = platform.malloc(Nlevels*sizeof(int)); + h_shiftIndex = platform.hostMalloc(Nlevels); + o_shiftIndex = platform.malloc(Nlevels); - mrdt = (dfloat*) calloc(Nlevels, sizeof(dfloat)); - o_mrdt = platform.malloc(Nlevels*sizeof(dfloat), mrdt); + mrdt.malloc(Nlevels, 0.0); + o_mrdt = platform.malloc(mrdt); - o_ab_a = platform.malloc(Nstages*Nstages*sizeof(dfloat), ab_a); - o_ab_b = platform.malloc(Nstages*Nstages*sizeof(dfloat), ab_b); + o_ab_a = 
platform.malloc(ab_a); + o_ab_b = platform.malloc(ab_b); } -void mrab3::Run(occa::memory &o_q, dfloat start, dfloat end) { +void mrab3::Run(solver_t& solver, deviceMemory &o_q, dfloat start, dfloat end) { dfloat time = start; //set timesteps and shifting index for (int lev=0;lev &o_q, dfloat time, dfloat _dt, int order) { - occa::memory o_A = o_ab_a+order*Nstages*sizeof(dfloat); - occa::memory o_B = o_ab_b+order*Nstages*sizeof(dfloat); + deviceMemory o_A = o_ab_a+order*Nstages; + deviceMemory o_B = o_ab_b+order*Nstages; for (int Ntick=0; Ntick < (1 << (Nlevels-1));Ntick++) { @@ -186,7 +187,7 @@ void mrab3::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) { //rotate index if (Nstages>2) for (int l=0; l<=lev; l++) - shiftIndex[l] = (shiftIndex[l]+Nstages-2)%(Nstages-1); + h_shiftIndex[l] = (h_shiftIndex[l]+Nstages-2)%(Nstages-1); //compute intermediate trace values on lev+1 / lev interface if (lev+1 pmlq(Npml, 0.0); + o_pmlq = platform.malloc(pmlq); - dfloat *pmlq = (dfloat*) calloc(Npml, sizeof(dfloat)); - o_pmlq = platform.malloc(Npml*sizeof(dfloat), pmlq); - free(pmlq); + memory rhspmlq0(Npml, 0.0); + o_rhspmlq0 = platform.malloc(rhspmlq0); - dfloat *rhspmlq0 = (dfloat*) calloc(Npml, sizeof(dfloat)); - o_rhspmlq0 = platform.malloc(Npml*sizeof(dfloat), rhspmlq0); - free(rhspmlq0); + memory rhspmlq((Nstages-1)*Npml, 0.0); + o_rhspmlq = platform.malloc(rhspmlq); - dfloat *rhspmlq = (dfloat*) calloc((Nstages-1)*Npml, sizeof(dfloat)); - o_rhspmlq = platform.malloc((Nstages-1)*Npml*sizeof(dfloat), rhspmlq); - free(rhspmlq); + properties_t kernelInfo = platform.props(); //copy base occa properties from solver - occa::properties kernelInfo = platform.props; //copy base occa properties from solver + const int blocksize=256; - kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE; + kernelInfo["defines/" "p_blockSize"] = blocksize; kernelInfo["defines/" "p_Nstages"] = Nstages; kernelInfo["defines/" "p_Np"] = mesh.Np; kernelInfo["defines/" "p_Nfp"] = mesh.Nfp; 
kernelInfo["defines/" "p_Nfaces"] = mesh.Nfaces; kernelInfo["defines/" "p_Nfields"] = Nfields; - int maxNodes = mymax(mesh.Np, mesh.Nfp*mesh.Nfaces); + int maxNodes = std::max(mesh.Np, mesh.Nfp*mesh.Nfaces); kernelInfo["defines/" "p_maxNodes"] = maxNodes; pmlUpdateKernel = platform.buildKernel(TIMESTEPPER_DIR "/okl/" @@ -263,10 +252,10 @@ mrab3_pml::mrab3_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements, } } -void mrab3_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) { +void mrab3_pml::Step(solver_t& solver, deviceMemory &o_q, dfloat time, dfloat _dt, int order) { - occa::memory o_A = o_ab_a+order*Nstages*sizeof(dfloat); - occa::memory o_B = o_ab_b+order*Nstages*sizeof(dfloat); + deviceMemory o_A = o_ab_a+order*Nstages; + deviceMemory o_B = o_ab_b+order*Nstages; for (int Ntick=0; Ntick < (1 << (Nlevels-1));Ntick++) { @@ -317,7 +306,7 @@ void mrab3_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) { //rotate index if (Nstages>2) for (int l=0; l<=lev; l++) - shiftIndex[l] = (shiftIndex[l]+Nstages-2)%(Nstages-1); + h_shiftIndex[l] = (h_shiftIndex[l]+Nstages-2)%(Nstages-1); //compute intermediate trace values on lev+1 / lev interface if (lev+1 +namespace libp { + namespace TimeStepper { using std::complex; mrsaab3::mrsaab3(dlong _Nelements, dlong _NhaloElements, int _Np, int _Nfields, - dfloat *_lambda, solver_t& _solver, mesh_t& _mesh): - timeStepper_t(_Nelements, _NhaloElements, _Np, _Nfields, _solver), + memory _lambda, + platform_t& _platform, mesh_t& _mesh): + timeStepperBase_t(_Nelements, _NhaloElements, _Np, _Nfields, + _platform, _mesh.comm), mesh(_mesh), Nlevels(mesh.mrNlevels), Nfields(_Nfields) { - platform_t &platform = solver.platform; - - lambda = (dfloat *) malloc(Nfields*sizeof(dfloat)); - memcpy(lambda, _lambda, Nfields*sizeof(dfloat)); + lambda.malloc(Nfields); + lambda.copyFrom(_lambda); Nstages = 3; - dfloat *rhsq0 = (dfloat*) calloc(N, sizeof(dfloat)); - o_rhsq0 = platform.malloc(N*sizeof(dfloat), rhsq0); 
- free(rhsq0); + memory rhsq0(N, 0.0); + o_rhsq0 = platform.malloc(rhsq0); + + memory rhsq((Nstages-1)*N, 0.0); + o_rhsq = platform.malloc(rhsq); - dfloat *rhsq = (dfloat*) calloc((Nstages-1)*N, sizeof(dfloat)); - o_rhsq = platform.malloc((Nstages-1)*N*sizeof(dfloat), rhsq); - free(rhsq); + o_fQM = platform.malloc((mesh.Nelements+mesh.totalHaloPairs)*mesh.Nfp + *mesh.Nfaces*Nfields); - o_fQM = platform.malloc((mesh.Nelements+mesh.totalHaloPairs)*mesh.Nfp - *mesh.Nfaces*Nfields*sizeof(dfloat)); + properties_t kernelInfo = platform.props(); //copy base occa properties from solver - occa::properties kernelInfo = platform.props; //copy base occa properties from solver + const int blocksize=256; - kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE; + kernelInfo["defines/" "p_blockSize"] = blocksize; kernelInfo["defines/" "p_Nstages"] = Nstages; kernelInfo["defines/" "p_Np"] = mesh.Np; kernelInfo["defines/" "p_Nfp"] = mesh.Nfp; kernelInfo["defines/" "p_Nfaces"] = mesh.Nfaces; kernelInfo["defines/" "p_Nfields"] = Nfields; - int maxNodes = mymax(mesh.Np, mesh.Nfp*mesh.Nfaces); + int maxNodes = std::max(mesh.Np, mesh.Nfp*mesh.Nfaces); kernelInfo["defines/" "p_maxNodes"] = maxNodes; updateKernel = platform.buildKernel(TIMESTEPPER_DIR "/okl/" @@ -78,37 +80,36 @@ mrsaab3::mrsaab3(dlong _Nelements, dlong _NhaloElements, "mrsaabTraceUpdate", kernelInfo); - saab_x = (dfloat*) calloc(Nlevels*Nfields, sizeof(dfloat)); - saab_a = (dfloat*) calloc(Nlevels*Nfields*Nstages*Nstages, sizeof(dfloat)); - saab_b = (dfloat*) calloc(Nlevels*Nfields*Nstages*Nstages, sizeof(dfloat)); + saab_x.malloc(Nlevels*Nfields); + saab_a.malloc(Nlevels*Nfields*Nstages*Nstages); + saab_b.malloc(Nlevels*Nfields*Nstages*Nstages); - shiftIndex = (int*) platform.hostMalloc(Nlevels*sizeof(int), - NULL, h_shiftIndex); - o_shiftIndex = platform.malloc(Nlevels*sizeof(int)); + h_shiftIndex = platform.hostMalloc(Nlevels); + o_shiftIndex = platform.malloc(Nlevels); - mrdt = (dfloat*) calloc(Nlevels, sizeof(dfloat)); - 
o_mrdt = platform.malloc(Nlevels*sizeof(dfloat), mrdt); + mrdt.malloc(Nlevels, 0.0); + o_mrdt = platform.malloc(mrdt); - o_saab_x = platform.malloc(Nlevels*Nfields*sizeof(dfloat)); - o_saab_a = platform.malloc(Nlevels*Nfields*Nstages*Nstages*sizeof(dfloat)); - o_saab_b = platform.malloc(Nlevels*Nfields*Nstages*Nstages*sizeof(dfloat)); + o_saab_x = platform.malloc(Nlevels*Nfields); + o_saab_a = platform.malloc(Nlevels*Nfields*Nstages*Nstages); + o_saab_b = platform.malloc(Nlevels*Nfields*Nstages*Nstages); } -void mrsaab3::Run(occa::memory &o_q, dfloat start, dfloat end) { +void mrsaab3::Run(solver_t& solver, deviceMemory &o_q, dfloat start, dfloat end) { dfloat time = start; //set timesteps and shifting index for (int lev=0;lev &o_q, dfloat time, dfloat _dt, int order) { - occa::memory o_A = o_saab_a+order*Nstages*sizeof(dfloat); - occa::memory o_B = o_saab_b+order*Nstages*sizeof(dfloat); + deviceMemory o_A = o_saab_a+order*Nstages; + deviceMemory o_B = o_saab_b+order*Nstages; for (int Ntick=0; Ntick < (1 << (Nlevels-1));Ntick++) { @@ -188,7 +189,7 @@ void mrsaab3::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) { //rotate index if (Nstages>2) for (int l=0; l<=lev; l++) - shiftIndex[l] = (shiftIndex[l]+Nstages-2)%(Nstages-1); + h_shiftIndex[l] = (h_shiftIndex[l]+Nstages-2)%(Nstages-1); //compute intermediate trace values on lev+1 / lev interface if (lev+1 _lambda, + platform_t& _platform, mesh_t& _mesh): + mrsaab3(Nelements, NhaloElements, + Np, _Nfields, _lambda, _platform, _mesh), Npml(NpmlElements*Np*_Npmlfields), Npmlfields(_Npmlfields) { if (Npml) { - platform_t &platform = solver.platform; + memory pmlq(Npml, 0.0); + o_pmlq = platform.malloc(pmlq); - dfloat *pmlq = (dfloat*) calloc(Npml, sizeof(dfloat)); - o_pmlq = platform.malloc(Npml*sizeof(dfloat), pmlq); - free(pmlq); + memory rhspmlq0(Npml, 0.0); + o_rhspmlq0 = platform.malloc(rhspmlq0); - dfloat *rhspmlq0 = (dfloat*) calloc(Npml, sizeof(dfloat)); - o_rhspmlq0 = 
platform.malloc(Npml*sizeof(dfloat), rhspmlq0); - free(rhspmlq0); + memory rhspmlq((Nstages-1)*Npml, 0.0); + o_rhspmlq = platform.malloc(rhspmlq); - dfloat *rhspmlq = (dfloat*) calloc((Nstages-1)*Npml, sizeof(dfloat)); - o_rhspmlq = platform.malloc((Nstages-1)*Npml*sizeof(dfloat), rhspmlq); - free(rhspmlq); + properties_t kernelInfo = platform.props(); //copy base occa properties from solver - occa::properties kernelInfo = platform.props; //copy base occa properties from solver + const int blocksize=256; - kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE; + kernelInfo["defines/" "p_blockSize"] = blocksize; kernelInfo["defines/" "p_Nstages"] = Nstages; kernelInfo["defines/" "p_Np"] = mesh.Np; kernelInfo["defines/" "p_Nfp"] = mesh.Nfp; kernelInfo["defines/" "p_Nfaces"] = mesh.Nfaces; kernelInfo["defines/" "p_Nfields"] = Nfields; - int maxNodes = mymax(mesh.Np, mesh.Nfp*mesh.Nfaces); + int maxNodes = std::max(mesh.Np, mesh.Nfp*mesh.Nfaces); kernelInfo["defines/" "p_maxNodes"] = maxNodes; pmlUpdateKernel = platform.buildKernel(TIMESTEPPER_DIR "/okl/" @@ -398,23 +376,23 @@ mrsaab3_pml::mrsaab3_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElement 5./8., -1./8., 0.0, 17./24., -7./24., 2./24.}; - pmlsaab_a = (dfloat*) calloc(Nstages*Nstages, sizeof(dfloat)); - pmlsaab_b = (dfloat*) calloc(Nstages*Nstages, sizeof(dfloat)); - memcpy(pmlsaab_a, _ab_a, Nstages*Nstages*sizeof(dfloat)); - memcpy(pmlsaab_b, _ab_b, Nstages*Nstages*sizeof(dfloat)); + pmlsaab_a.malloc(Nstages*Nstages); + pmlsaab_b.malloc(Nstages*Nstages); + pmlsaab_a.copyFrom(_ab_a); + pmlsaab_b.copyFrom(_ab_b); - o_pmlsaab_a = platform.malloc(Nstages*Nstages*sizeof(dfloat), pmlsaab_a); - o_pmlsaab_b = platform.malloc(Nstages*Nstages*sizeof(dfloat), pmlsaab_b); + o_pmlsaab_a = platform.malloc(pmlsaab_a); + o_pmlsaab_b = platform.malloc(pmlsaab_b); } } -void mrsaab3_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) { +void mrsaab3_pml::Step(solver_t& solver, deviceMemory &o_q, dfloat time, 
dfloat _dt, int order) { - occa::memory o_A = o_saab_a+order*Nstages*sizeof(dfloat); - occa::memory o_B = o_saab_b+order*Nstages*sizeof(dfloat); + deviceMemory o_A = o_saab_a+order*Nstages; + deviceMemory o_B = o_saab_b+order*Nstages; - occa::memory o_pmlA; - if (Npml) o_pmlA = o_pmlsaab_a+order*Nstages*sizeof(dfloat); + deviceMemory o_pmlA; + if (Npml) o_pmlA = o_pmlsaab_a+order*Nstages; for (int Ntick=0; Ntick < (1 << (Nlevels-1));Ntick++) { @@ -466,7 +444,7 @@ void mrsaab3_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) { //rotate index if (Nstages>2) for (int l=0; l<=lev; l++) - shiftIndex[l] = (shiftIndex[l]+Nstages-2)%(Nstages-1); + h_shiftIndex[l] = (h_shiftIndex[l]+Nstages-2)%(Nstages-1); //compute intermediate trace values on lev+1 / lev interface if (lev+1 +namespace libp { + namespace TimeStepper { using std::complex; saab3::saab3(dlong _Nelements, dlong _NhaloElements, int _Np, int _Nfields, - dfloat *_lambda, solver_t& _solver): - timeStepper_t(_Nelements, _NhaloElements, _Np, _Nfields, _solver), + memory _lambda, + platform_t& _platform, comm_t _comm): + timeStepperBase_t(_Nelements, _NhaloElements, + _Np, _Nfields, _platform, _comm), Np(_Np), Nfields(_Nfields), Nelements(_Nelements), NhaloElements(_NhaloElements) { - platform_t &platform = solver.platform; - - lambda = (dfloat *) malloc(Nfields*sizeof(dfloat)); - memcpy(lambda, _lambda, Nfields*sizeof(dfloat)); + lambda.malloc(Nfields); + lambda.copyFrom(_lambda); Nstages = 3; shiftIndex = 0; - o_rhsq = platform.malloc(Nstages*N*sizeof(dfloat)); + o_rhsq = platform.malloc(Nstages*N); + + const int blocksize=256; - occa::properties kernelInfo = platform.props; //copy base occa properties from solver + properties_t kernelInfo = platform.props(); //copy base occa properties from solver - kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE; + kernelInfo["defines/" "p_blockSize"] = blocksize; kernelInfo["defines/" "p_Nstages"] = Nstages; kernelInfo["defines/" "p_Np"] = (int)Np; 
kernelInfo["defines/" "p_Nfields"] = (int)Nfields; @@ -63,20 +67,20 @@ saab3::saab3(dlong _Nelements, dlong _NhaloElements, "saabUpdate", kernelInfo); - saab_x = (dfloat*) malloc(Nfields*sizeof(dfloat)); - o_saab_x = platform.malloc(Nfields*sizeof(dfloat)); + h_saab_x = platform.hostMalloc(Nfields); + o_saab_x = platform.malloc(Nfields); - saab_a = (dfloat*) malloc(Nfields*Nstages*Nstages*sizeof(dfloat)); - o_saab_a = platform.malloc(Nfields*Nstages*Nstages*sizeof(dfloat)); + h_saab_a = platform.hostMalloc(Nfields*Nstages*Nstages); + o_saab_a = platform.malloc(Nfields*Nstages*Nstages); } -void saab3::Run(occa::memory &o_q, dfloat start, dfloat end) { +void saab3::Run(solver_t& solver, deviceMemory &o_q, dfloat start, dfloat end) { dfloat time = start; solver.Report(time,0); - dfloat outputInterval; + dfloat outputInterval=0.0; solver.settings.getSetting("OUTPUT INTERVAL", outputInterval); dfloat outputTime = time + outputInterval; @@ -87,7 +91,7 @@ void saab3::Run(occa::memory &o_q, dfloat start, dfloat end) { int tstep=0; int order=0; while (time < end) { - Step(o_q, time, dt, order); + Step(solver, o_q, time, dt, order); time += dt; tstep++; if (order &o_q, dfloat time, dfloat _dt, int order) { //rhs at current index - occa::memory o_rhsq0 = o_rhsq + shiftIndex*N*sizeof(dfloat); + deviceMemory o_rhsq0 = o_rhsq + shiftIndex*N; //coefficients at current order - occa::memory o_X = o_saab_x; - occa::memory o_A = o_saab_a + order*Nstages*sizeof(dfloat); + deviceMemory o_X = o_saab_x; + deviceMemory o_A = o_saab_a + order*Nstages; //evaluate ODE rhs = f(q,t) solver.rhsf(o_q, o_rhsq0, time); @@ -147,8 +151,8 @@ void saab3::UpdateCoefficients() { 3./2., -1./2., 0.0, 23./12., -16./12., 5./12.}; - memcpy(saab_x+n ,_saab_X, 1*sizeof(dfloat)); - memcpy(saab_a+n*Nstages*Nstages,_saab_A,Nstages*Nstages*sizeof(dfloat)); + h_saab_x.copyFrom(_saab_X, 1, n ); + h_saab_a.copyFrom(_saab_A, Nstages*Nstages, n*Nstages*Nstages); } else { @@ -186,30 +190,22 @@ void 
saab3::UpdateCoefficients() { dfloat aa32=real(a32)/ (double) Nr; dfloat aa33=real(a33)/ (double) Nr; - dfloat _saab_X[1] = { exp(alpha) }; + dfloat _saab_X[1] = { std::exp(alpha) }; dfloat _saab_A[Nstages*Nstages] ={ aa11, 0.0, 0.0, aa21, aa22, 0.0, aa31, aa32, aa33 }; - memcpy(saab_x+n ,_saab_X, 1*sizeof(dfloat)); - memcpy(saab_a+n*Nstages*Nstages,_saab_A,Nstages*Nstages*sizeof(dfloat)); + h_saab_x.copyFrom(_saab_X, 1, n ); + h_saab_a.copyFrom(_saab_A, Nstages*Nstages, n*Nstages*Nstages); } // move data to platform - o_saab_x.copyFrom(saab_x); - o_saab_a.copyFrom(saab_a); + h_saab_x.copyTo(o_saab_x); + h_saab_a.copyTo(o_saab_a); } } -saab3::~saab3() { - if (o_rhsq.size()) o_rhsq.free(); - if (o_saab_x.size()) o_saab_x.free(); - if (o_saab_a.size()) o_saab_a.free(); - - updateKernel.free(); -} - /**************************************************/ /* PML version */ @@ -217,22 +213,22 @@ saab3::~saab3() { saab3_pml::saab3_pml(dlong _Nelements, dlong _NpmlElements, dlong _NhaloElements, int _Np, int _Nfields, int Npmlfields, - dfloat *_lambda, solver_t& _solver): - saab3(_Nelements, _NhaloElements, _Np, _Nfields, _lambda, _solver), + memory _lambda, + platform_t& _platform, comm_t _comm): + saab3(_Nelements, _NhaloElements, _Np, _Nfields, _lambda, _platform, _comm), Npml(Npmlfields*_Np*_NpmlElements) { if (Npml) { - platform_t &platform = solver.platform; + memory pmlq(Npml,0.0); + o_pmlq = platform.malloc(pmlq); - dfloat *pmlq = (dfloat *) calloc(Npml,sizeof(dfloat)); - o_pmlq = platform.malloc(Npml*sizeof(dfloat), pmlq); - free(pmlq); + o_rhspmlq = platform.malloc(Nstages*Npml); - o_rhspmlq = platform.malloc(Nstages*Npml*sizeof(dfloat)); + properties_t kernelInfo = platform.props(); //copy base occa properties from solver - occa::properties kernelInfo = platform.props; //copy base occa properties from solver + const int blocksize=256; - kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE; + kernelInfo["defines/" "p_blockSize"] = blocksize; kernelInfo["defines/" 
"p_Nstages"] = Nstages; kernelInfo["defines/" "p_Np"] = (int)Np; kernelInfo["defines/" "p_Nfields"] = (int)Nfields; @@ -248,29 +244,29 @@ saab3_pml::saab3_pml(dlong _Nelements, dlong _NpmlElements, dlong _NhaloElements 3./2., -1./2., 0.0, 23./12., -16./12., 5./12.}; - pmlsaab_a = (dfloat*) calloc(Nstages*Nstages, sizeof(dfloat)); - memcpy(pmlsaab_a, _ab_a, Nstages*Nstages*sizeof(dfloat)); + pmlsaab_a.malloc(Nstages*Nstages); + pmlsaab_a.copyFrom(_ab_a); - o_pmlsaab_a = platform.malloc(Nstages*Nstages*sizeof(dfloat), pmlsaab_a); + o_pmlsaab_a = platform.malloc(pmlsaab_a); } } -void saab3_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) { +void saab3_pml::Step(solver_t& solver, deviceMemory &o_q, dfloat time, dfloat _dt, int order) { //rhs at current index - occa::memory o_rhsq0 = o_rhsq + shiftIndex*N*sizeof(dfloat); - occa::memory o_rhspmlq0; + deviceMemory o_rhsq0 = o_rhsq + shiftIndex*N; + deviceMemory o_rhspmlq0; //coefficients at current order - occa::memory o_X = o_saab_x; - occa::memory o_A = o_saab_a + order*Nstages*sizeof(dfloat); - occa::memory o_pmlA; + deviceMemory o_X = o_saab_x; + deviceMemory o_A = o_saab_a + order*Nstages; + deviceMemory o_pmlA; if (Npml) { - o_rhspmlq0 = o_rhspmlq + shiftIndex*Npml*sizeof(dfloat); - o_pmlA = o_pmlsaab_a + order*Nstages*sizeof(dfloat); + o_rhspmlq0 = o_rhspmlq + shiftIndex*Npml; + o_pmlA = o_pmlsaab_a + order*Nstages; } //evaluate ODE rhs = f(q,t) @@ -297,12 +293,6 @@ void saab3_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) { shiftIndex = (shiftIndex+Nstages-1)%Nstages; } -saab3_pml::~saab3_pml() { - if (o_pmlq.size()) o_pmlq.free(); - if (o_rhspmlq.size()) o_rhspmlq.free(); - if (o_pmlsaab_a.size()) o_pmlsaab_a.free(); - - pmlUpdateKernel.free(); -} - } //namespace TimeStepper + +} //namespace libp diff --git a/libs/timeStepper/timeStepperSARK4.cpp b/libs/timeStepper/timeStepperSARK4.cpp index 506db1b54..b3749b958 100644 --- a/libs/timeStepper/timeStepperSARK4.cpp +++ 
b/libs/timeStepper/timeStepperSARK4.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -29,24 +29,25 @@ SOFTWARE. #include "timeStepper.hpp" #include +namespace libp { + namespace TimeStepper { using std::complex; sark4::sark4(dlong _Nelements, dlong _NhaloElements, int _Np, int _Nfields, - dfloat *_lambda, solver_t& _solver, MPI_Comm _comm): - timeStepper_t(_Nelements, _NhaloElements, _Np, _Nfields, _solver), - comm(_comm), + memory _lambda, + platform_t& _platform, comm_t _comm): + timeStepperBase_t(_Nelements, _NhaloElements, _Np, _Nfields, + _platform, _comm), Np(_Np), Nfields(_Nfields), Nelements(_Nelements), NhaloElements(_NhaloElements) { - platform_t &platform = solver.platform; - - lambda = (dfloat *) malloc(Nfields*sizeof(dfloat)); - memcpy(lambda, _lambda, Nfields*sizeof(dfloat)); + lambda.malloc(Nfields); + lambda.copyFrom(_lambda); Nrk = 5; order = 4; @@ -55,26 +56,27 @@ sark4::sark4(dlong _Nelements, dlong _NhaloElements, dlong Nlocal = Nelements*Np*Nfields; dlong Ntotal = (Nelements+NhaloElements)*Np*Nfields; - o_rkq = platform.malloc(Ntotal*sizeof(dfloat)); - o_rhsq = platform.malloc(Nlocal*sizeof(dfloat)); - o_rkrhsq = platform.malloc(Nlocal*Nrk*sizeof(dfloat)); - o_rkerr = platform.malloc(Nlocal*sizeof(dfloat)); + o_rkq = platform.malloc(Ntotal); + o_rhsq = platform.malloc(Nlocal); + o_rkrhsq = platform.malloc(Nlocal*Nrk); + o_rkerr = platform.malloc(Nlocal); - o_saveq = platform.malloc(Nlocal*sizeof(dfloat)); + o_saveq = platform.malloc(Nlocal); - Nblock = (N+BLOCKSIZE-1)/BLOCKSIZE; - errtmp = (dfloat*) calloc(Nblock, sizeof(dfloat)); - o_errtmp = platform.malloc(Nblock*sizeof(dfloat)); + const int blocksize=256; - hlong gNlocal = Nlocal; - hlong 
gNtotal; - MPI_Allreduce(&gNlocal, &gNtotal, 1, MPI_HLONG, MPI_SUM, comm); + Nblock = (N+blocksize-1)/blocksize; + h_errtmp = platform.hostMalloc(Nblock); + o_errtmp = platform.malloc(Nblock); + + hlong gNtotal = Nlocal; + comm.Allreduce(gNtotal); //copy base occa properties from platform - occa::properties kernelInfo = solver.platform.props; + properties_t kernelInfo = platform.props(); //add defines - kernelInfo["defines/" "p_blockSize"] = (int)BLOCKSIZE; + kernelInfo["defines/" "p_blockSize"] = (int)blocksize; kernelInfo["defines/" "p_Nrk"] = (int)Nrk; kernelInfo["defines/" "p_Np"] = (int)Np; kernelInfo["defines/" "p_Nfields"] = (int)Nfields; @@ -96,16 +98,16 @@ sark4::sark4(dlong _Nelements, dlong _NhaloElements, // Semi-Analytic Runge Kutta - order (3) 4 with PID timestep control dfloat _rkC[Nrk] = {0.0, 0.5, 0.5, 1.0, 1.0}; - rkC = (dfloat*) calloc(Nrk, sizeof(dfloat)); - memcpy(rkC, _rkC, Nrk*sizeof(dfloat)); + rkC.malloc(Nrk); + rkC.copyFrom(_rkC); - rkX = (dfloat*) platform.hostMalloc(Nfields*Nrk* sizeof(dfloat), NULL, h_rkX); - rkA = (dfloat*) platform.hostMalloc(Nfields*Nrk*Nrk*sizeof(dfloat), NULL, h_rkA); - rkE = (dfloat*) platform.hostMalloc(Nfields*Nrk* sizeof(dfloat), NULL, h_rkE); + h_rkX = platform.hostMalloc(Nfields*Nrk); + h_rkA = platform.hostMalloc(Nfields*Nrk*Nrk); + h_rkE = platform.hostMalloc(Nfields*Nrk); - o_rkX = platform.malloc(Nfields*Nrk* sizeof(dfloat)); - o_rkA = platform.malloc(Nfields*Nrk*Nrk*sizeof(dfloat)); - o_rkE = platform.malloc(Nfields*Nrk* sizeof(dfloat)); + o_rkX = platform.malloc(Nfields*Nrk); + o_rkA = platform.malloc(Nfields*Nrk*Nrk); + o_rkE = platform.malloc(Nfields*Nrk); dtMIN = 1E-9; //minumum allowed timestep ATOL = 1E-5; //absolute error tolerance @@ -124,16 +126,15 @@ sark4::sark4(dlong _Nelements, dlong _NhaloElements, sqrtinvNtotal = 1.0/sqrt(gNtotal); } -void sark4::Run(occa::memory &o_q, dfloat start, dfloat end) { +void sark4::Run(solver_t& solver, deviceMemory &o_q, dfloat start, dfloat end) { dfloat time 
= start; - int rank; - MPI_Comm_rank(comm, &rank); + int rank = comm.rank(); solver.Report(time,0); - dfloat outputInterval; + dfloat outputInterval=0.0; solver.settings.getSetting("OUTPUT INTERVAL", outputInterval); dfloat outputTime = time + outputInterval; @@ -145,23 +146,17 @@ void sark4::Run(occa::memory &o_q, dfloat start, dfloat end) { while (time < end) { - if (dt &o_Q) { + o_saveq.copyFrom(o_Q, N); } -void sark4::Restore(occa::memory &o_Q) { - o_saveq.copyTo(o_Q, N*sizeof(dfloat)); +void sark4::Restore(deviceMemory &o_Q) { + o_saveq.copyTo(o_Q, N); } -void sark4::AcceptStep(occa::memory &o_q, occa::memory &o_rq) { - o_q.copyFrom(o_rq, N*sizeof(dfloat)); +void sark4::AcceptStep(deviceMemory &o_q, deviceMemory &o_rq) { + o_q.copyFrom(o_rq, N); } -void sark4::Step(occa::memory &o_q, dfloat time, dfloat _dt) { +void sark4::Step(solver_t& solver, deviceMemory &o_q, dfloat time, dfloat _dt) { //RK step for(int rk=0;rk& o_q){ //Error estimation //E. HAIRER, S.P. NORSETT AND G. WANNER, SOLVING ORDINARY @@ -311,13 +307,12 @@ dfloat sark4::Estimater(occa::memory& o_q){ o_rkerr, o_errtmp); - o_errtmp.copyTo(errtmp); - dfloat localerr = 0; + h_errtmp.copyFrom(o_errtmp); dfloat err = 0; for(dlong n=0;n _lambda, + platform_t& _platform, comm_t _comm): + sark4(_Nelements, _NhaloElements, _Np, _Nfields, _lambda, _platform, _comm), Npml(_Npmlfields*_Np*_NpmlElements) { if (Npml) { - platform_t &platform = solver.platform; - - dfloat *pmlq = (dfloat *) calloc(Npml,sizeof(dfloat)); - o_pmlq = platform.malloc(Npml*sizeof(dfloat), pmlq); - free(pmlq); + memory pmlq(Npml,0.0); + o_pmlq = platform.malloc(pmlq); - o_rkpmlq = platform.malloc(Npml*sizeof(dfloat)); - o_rhspmlq = platform.malloc(Npml*sizeof(dfloat)); - o_rkrhspmlq = platform.malloc(Npml*Nrk*sizeof(dfloat)); + o_rkpmlq = platform.malloc(Npml); + o_rhspmlq = platform.malloc(Npml); + o_rkrhspmlq = platform.malloc(Npml*Nrk); - o_savepmlq = platform.malloc(Npml*sizeof(dfloat)); + o_savepmlq = platform.malloc(Npml); //copy 
base occa properties from solver - occa::properties kernelInfo = platform.props; + properties_t kernelInfo = platform.props(); + + const int blocksize=256; //add defines - kernelInfo["defines/" "p_blockSize"] = (int)BLOCKSIZE; + kernelInfo["defines/" "p_blockSize"] = (int)blocksize; kernelInfo["defines/" "p_Nrk"] = (int)Nrk; kernelInfo["defines/" "p_Np"] = (int)Np; kernelInfo["defines/" "p_Nfields"] = (int)Nfields; @@ -490,7 +468,7 @@ sark4_pml::sark4_pml(dlong _Nelements, dlong _NpmlElements, dlong _NhaloElements kernelInfo); // Semi-Analytic Runge Kutta - order (3) 4 with PID timestep control - pmlrkA = (dfloat*) malloc(Nrk*Nrk*sizeof(dfloat)); + pmlrkA.malloc(Nrk*Nrk); dfloat _pmlrkA[Nrk*Nrk] = { 0.0, 0.0, 0.0, 0.0, 0.0, @@ -498,31 +476,31 @@ sark4_pml::sark4_pml(dlong _Nelements, dlong _NpmlElements, dlong _NhaloElements 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0/6.0, 1.0/3.0, 1.0/3.0, 1.0/6.0, 0.0}; - memcpy(pmlrkA, _pmlrkA, Nrk*Nrk*sizeof(dfloat)); + pmlrkA.copyFrom(_pmlrkA); - o_pmlrkA = platform.malloc(Nrk*Nrk*sizeof(dfloat), pmlrkA); + o_pmlrkA = platform.malloc(pmlrkA); } } -void sark4_pml::Backup(occa::memory &o_Q) { - o_saveq.copyFrom(o_Q, N*sizeof(dfloat)); +void sark4_pml::Backup(deviceMemory &o_Q) { + o_saveq.copyFrom(o_Q, N); if (Npml) - o_savepmlq.copyFrom(o_rkpmlq, Npml*sizeof(dfloat)); + o_savepmlq.copyFrom(o_rkpmlq, Npml); } -void sark4_pml::Restore(occa::memory &o_Q) { - o_saveq.copyTo(o_Q, N*sizeof(dfloat)); +void sark4_pml::Restore(deviceMemory &o_Q) { + o_saveq.copyTo(o_Q, N); if (Npml) - o_savepmlq.copyTo(o_rkpmlq, Npml*sizeof(dfloat)); + o_savepmlq.copyTo(o_rkpmlq, Npml); } -void sark4_pml::AcceptStep(occa::memory &o_q, occa::memory &o_rq) { - o_q.copyFrom(o_rq, N*sizeof(dfloat)); +void sark4_pml::AcceptStep(deviceMemory &o_q, deviceMemory &o_rq) { + o_q.copyFrom(o_rq, N); if (Npml) - o_pmlq.copyFrom(o_rkpmlq, Npml*sizeof(dfloat)); + o_pmlq.copyFrom(o_rkpmlq, Npml); } -void sark4_pml::Step(occa::memory &o_q, dfloat time, dfloat 
_dt) { +void sark4_pml::Step(solver_t& solver, deviceMemory &o_q, dfloat time, dfloat _dt) { //RK step for(int rk=0;rk +namespace libp { + namespace TimeStepper { using std::complex; sark5::sark5(dlong _Nelements, dlong _NhaloElements, int _Np, int _Nfields, - dfloat *_lambda, solver_t& _solver, MPI_Comm _comm): - timeStepper_t(_Nelements, _NhaloElements, _Np, _Nfields, _solver), - comm(_comm), + memory _lambda, + platform_t& _platform, comm_t _comm): + timeStepperBase_t(_Nelements, _NhaloElements, _Np, _Nfields, + _platform, _comm), Np(_Np), Nfields(_Nfields), Nelements(_Nelements), NhaloElements(_NhaloElements) { - platform_t &platform = solver.platform; - - lambda = (dfloat *) malloc(Nfields*sizeof(dfloat)); - memcpy(lambda, _lambda, Nfields*sizeof(dfloat)); + lambda.malloc(Nfields); + lambda.copyFrom(_lambda); Nrk = 7; //number of stages order = 5; @@ -55,25 +56,26 @@ sark5::sark5(dlong _Nelements, dlong _NhaloElements, dlong Nlocal = Nelements*Np*Nfields; dlong Ntotal = (Nelements+NhaloElements)*Np*Nfields; - o_rkq = platform.malloc(Ntotal*sizeof(dfloat)); - o_rhsq = platform.malloc(Nlocal*sizeof(dfloat)); - o_rkrhsq = platform.malloc(Nlocal*Nrk*sizeof(dfloat)); - o_rkerr = platform.malloc(Nlocal*sizeof(dfloat)); + o_rkq = platform.malloc(Ntotal); + o_rhsq = platform.malloc(Nlocal); + o_rkrhsq = platform.malloc(Nlocal*Nrk); + o_rkerr = platform.malloc(Nlocal); + + o_saveq = platform.malloc(Nlocal); - o_saveq = platform.malloc(Nlocal*sizeof(dfloat)); + const int blocksize=256; - Nblock = (N+BLOCKSIZE-1)/BLOCKSIZE; - errtmp = (dfloat*) calloc(Nblock, sizeof(dfloat)); - o_errtmp = platform.malloc(Nblock*sizeof(dfloat)); + Nblock = (N+blocksize-1)/blocksize; + h_errtmp = platform.hostMalloc(Nblock); + o_errtmp = platform.malloc(Nblock); - hlong gNlocal = Nlocal; - hlong gNtotal; - MPI_Allreduce(&gNlocal, &gNtotal, 1, MPI_HLONG, MPI_SUM, comm); + hlong gNtotal = Nlocal; + comm.Allreduce(gNtotal); - occa::properties kernelInfo = platform.props; //copy base occa 
properties from solver + properties_t kernelInfo = platform.props(); //copy base occa properties from solver //add defines - kernelInfo["defines/" "p_blockSize"] = (int)BLOCKSIZE; + kernelInfo["defines/" "p_blockSize"] = (int)blocksize; kernelInfo["defines/" "p_Nrk"] = (int)Nrk; kernelInfo["defines/" "p_Np"] = (int)Np; kernelInfo["defines/" "p_Nfields"] = (int)Nfields; @@ -95,16 +97,16 @@ sark5::sark5(dlong _Nelements, dlong _NhaloElements, // Semi-Analytic Runge Kutta - order (4) 5 with PID timestep control dfloat _rkC[Nrk] = {0.0, 0.25, 0.25, 0.5, 0.75, 1.0, 1.0}; - rkC = (dfloat*) calloc(Nrk, sizeof(dfloat)); - memcpy(rkC, _rkC, Nrk*sizeof(dfloat)); + rkC.malloc(Nrk); + rkC.copyFrom(_rkC); - rkX = (dfloat*) platform.hostMalloc(Nfields*Nrk* sizeof(dfloat), NULL, h_rkX); - rkA = (dfloat*) platform.hostMalloc(Nfields*Nrk*Nrk*sizeof(dfloat), NULL, h_rkA); - rkE = (dfloat*) platform.hostMalloc(Nfields*Nrk* sizeof(dfloat), NULL, h_rkE); + h_rkX = platform.hostMalloc(Nfields*Nrk); + h_rkA = platform.hostMalloc(Nfields*Nrk*Nrk); + h_rkE = platform.hostMalloc(Nfields*Nrk); - o_rkX = platform.malloc(Nfields*Nrk* sizeof(dfloat)); - o_rkA = platform.malloc(Nfields*Nrk*Nrk*sizeof(dfloat)); - o_rkE = platform.malloc(Nfields*Nrk* sizeof(dfloat)); + o_rkX = platform.malloc(Nfields*Nrk); + o_rkA = platform.malloc(Nfields*Nrk*Nrk); + o_rkE = platform.malloc(Nfields*Nrk); dtMIN = 1E-9; //minumum allowed timestep ATOL = 1E-5; //absolute error tolerance @@ -123,16 +125,15 @@ sark5::sark5(dlong _Nelements, dlong _NhaloElements, sqrtinvNtotal = 1.0/sqrt(gNtotal); } -void sark5::Run(occa::memory &o_q, dfloat start, dfloat end) { +void sark5::Run(solver_t& solver, deviceMemory &o_q, dfloat start, dfloat end) { dfloat time = start; - int rank; - MPI_Comm_rank(comm, &rank); + int rank = comm.rank(); solver.Report(time,0); - dfloat outputInterval; + dfloat outputInterval=0.0; solver.settings.getSetting("OUTPUT INTERVAL", outputInterval); dfloat outputTime = time + outputInterval; @@ 
-144,23 +145,17 @@ void sark5::Run(occa::memory &o_q, dfloat start, dfloat end) { while (time < end) { - if (dt &o_Q) { + o_saveq.copyFrom(o_Q, N); } -void sark5::Restore(occa::memory &o_Q) { - o_saveq.copyTo(o_Q, N*sizeof(dfloat)); +void sark5::Restore(deviceMemory &o_Q) { + o_saveq.copyTo(o_Q, N); } -void sark5::AcceptStep(occa::memory &o_q, occa::memory &o_rq) { - o_q.copyFrom(o_rq, N*sizeof(dfloat)); +void sark5::AcceptStep(deviceMemory &o_q, deviceMemory &o_rq) { + o_q.copyFrom(o_rq, N); } -void sark5::Step(occa::memory &o_q, dfloat time, dfloat _dt) { +void sark5::Step(solver_t& solver, deviceMemory &o_q, dfloat time, dfloat _dt) { //RK step for(int rk=0;rk& o_q){ //Error estimation //E. HAIRER, S.P. NORSETT AND G. WANNER, SOLVING ORDINARY @@ -310,13 +306,12 @@ dfloat sark5::Estimater(occa::memory& o_q){ o_rkerr, o_errtmp); - o_errtmp.copyTo(errtmp); - dfloat localerr = 0; + h_errtmp.copyFrom(o_errtmp); dfloat err = 0; for(dlong n=0;n _lambda, + platform_t& _platform, comm_t _comm): + sark5(_Nelements, _NhaloElements, _Np, _Nfields, _lambda, _platform, _comm), Npml(_Npmlfields*_Np*_NpmlElements) { if (Npml) { - platform_t &platform = solver.platform; + memory pmlq(Npml,0.0); + o_pmlq = platform.malloc(pmlq); - dfloat *pmlq = (dfloat *) calloc(Npml,sizeof(dfloat)); - o_pmlq = platform.malloc(Npml*sizeof(dfloat), pmlq); - free(pmlq); + o_rkpmlq = platform.malloc(Npml); + o_rhspmlq = platform.malloc(Npml); + o_rkrhspmlq = platform.malloc(Npml*Nrk); - o_rkpmlq = platform.malloc(Npml*sizeof(dfloat)); - o_rhspmlq = platform.malloc(Npml*sizeof(dfloat)); - o_rkrhspmlq = platform.malloc(Npml*Nrk*sizeof(dfloat)); + o_savepmlq = platform.malloc(Npml); - o_savepmlq = platform.malloc(Npml*sizeof(dfloat)); + properties_t kernelInfo = platform.props(); //copy base occa properties from solver - occa::properties kernelInfo = platform.props; //copy base occa properties from solver + const int blocksize=256; //add defines - kernelInfo["defines/" "p_blockSize"] = (int)BLOCKSIZE; 
+ kernelInfo["defines/" "p_blockSize"] = (int)blocksize; kernelInfo["defines/" "p_Nrk"] = (int)Nrk; kernelInfo["defines/" "p_Np"] = (int)Np; kernelInfo["defines/" "p_Nfields"] = (int)Nfields; @@ -545,7 +524,7 @@ sark5_pml::sark5_pml(dlong _Nelements, dlong _NpmlElements, dlong _NhaloElements kernelInfo); // Semi-Analytic Runge Kutta - order (3) 4 with PID timestep control - pmlrkA = (dfloat*) malloc(Nrk*Nrk*sizeof(dfloat)); + pmlrkA.malloc(Nrk*Nrk); dfloat _pmlrkA[Nrk*Nrk] = { 0, 0, 0, 0, 0, 0, 0, 1/4, 0, 0, 0, 0, 0, 0, @@ -555,31 +534,31 @@ sark5_pml::sark5_pml(dlong _Nelements, dlong _NpmlElements, dlong _NhaloElements -3./7., 8./7., 6./7., -12./7., 8./7., 0, 0, 7./90., 0., 16./45., 2./15., 16./45., 7./90., 0}; - memcpy(pmlrkA, _pmlrkA, Nrk*Nrk*sizeof(dfloat)); + pmlrkA.copyFrom(_pmlrkA); - o_pmlrkA = platform.malloc(Nrk*Nrk*sizeof(dfloat), pmlrkA); + o_pmlrkA = platform.malloc(pmlrkA); } } -void sark5_pml::Backup(occa::memory &o_Q) { - o_saveq.copyFrom(o_Q, N*sizeof(dfloat)); +void sark5_pml::Backup(deviceMemory &o_Q) { + o_saveq.copyFrom(o_Q, N); if (Npml) - o_savepmlq.copyFrom(o_rkpmlq, Npml*sizeof(dfloat)); + o_savepmlq.copyFrom(o_rkpmlq, Npml); } -void sark5_pml::Restore(occa::memory &o_Q) { - o_saveq.copyTo(o_Q, N*sizeof(dfloat)); +void sark5_pml::Restore(deviceMemory &o_Q) { + o_saveq.copyTo(o_Q, N); if (Npml) - o_savepmlq.copyTo(o_rkpmlq, Npml*sizeof(dfloat)); + o_savepmlq.copyTo(o_rkpmlq, Npml); } -void sark5_pml::AcceptStep(occa::memory &o_q, occa::memory &o_rq) { - o_q.copyFrom(o_rq, N*sizeof(dfloat)); +void sark5_pml::AcceptStep(deviceMemory &o_q, deviceMemory &o_rq) { + o_q.copyFrom(o_rq, N); if (Npml) - o_pmlq.copyFrom(o_rkpmlq, Npml*sizeof(dfloat)); + o_pmlq.copyFrom(o_rkpmlq, Npml); } -void sark5_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt) { +void sark5_pml::Step(solver_t& solver, deviceMemory &o_q, dfloat time, dfloat _dt) { //RK step for(int rk=0;rk(Nstages*N); //q history + o_qhat = platform.malloc(Nstages*N); //F(q) history (explicit 
part) + o_rhs = platform.malloc(N); //rhs storage - occa::properties kernelInfo = platform.props; //copy base occa properties from solver + properties_t kernelInfo = platform.props(); //copy base occa properties from solver - kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE; + const int blocksize=256; + + kernelInfo["defines/" "p_blockSize"] = blocksize; kernelInfo["defines/" "p_Nstages"] = Nstages; rhsKernel = platform.buildKernel(TIMESTEPPER_DIR "/okl/" @@ -59,23 +63,23 @@ ssbdf3::ssbdf3(dlong Nelements, dlong NhaloElements, 3./2., 2., -1./2., 0., 11./6., 3., -3./2., 1./3.}; - ssbdf_b = (dfloat*) calloc(Nstages*(Nstages+1), sizeof(dfloat)); - memcpy(ssbdf_b, _b, Nstages*(Nstages+1)*sizeof(dfloat)); + ssbdf_b.malloc(Nstages*(Nstages+1)); + ssbdf_b.copyFrom(_b); - o_ssbdf_b = platform.malloc(Nstages*(Nstages+1)*sizeof(dfloat), ssbdf_b); + o_ssbdf_b = platform.malloc(ssbdf_b); } -dfloat ssbdf3::getGamma() { - return *(ssbdf_b + (Nstages-1)*(Nstages+1)); //first entry of last row of B +dfloat ssbdf3::GetGamma() { + return ssbdf_b[(Nstages-1)*(Nstages+1)]; //first entry of last row of B } -void ssbdf3::Run(occa::memory &o_q, dfloat start, dfloat end) { +void ssbdf3::Run(solver_t& solver, deviceMemory &o_q, dfloat start, dfloat end) { dfloat time = start; solver.Report(time,0); - dfloat outputInterval; + dfloat outputInterval=0.0; solver.settings.getSetting("OUTPUT INTERVAL", outputInterval); dfloat outputTime = time + outputInterval; @@ -83,7 +87,7 @@ void ssbdf3::Run(occa::memory &o_q, dfloat start, dfloat end) { int tstep=0; int order=0; while (time < end) { - Step(o_q, time, dt, order); + Step(solver, o_q, time, dt, order); time += dt; tstep++; if (order &o_q, dfloat time, dfloat _dt, int order) { //BDF coefficients at current order - occa::memory o_B = o_ssbdf_b + order*(Nstages+1)*sizeof(dfloat); - dfloat *B = ssbdf_b + order*(Nstages+1); + deviceMemory o_B = o_ssbdf_b + order*(Nstages+1); + memory B = ssbdf_b + order*(Nstages+1); //put current q into history - 
occa::memory o_qn0 = o_qn + shiftIndex*N*sizeof(dfloat); - o_qn0.copyFrom(o_q, N*sizeof(dfloat)); + deviceMemory o_qn0 = o_qn + shiftIndex*N; + o_qn0.copyFrom(o_q, N); // Compute qhat = sum_i=1^s B_i qhat(t_n+1-i) by // where qhat(t) is the Lagrangian state of q @@ -129,15 +133,6 @@ void ssbdf3::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) { shiftIndex = (shiftIndex+Nstages-1)%Nstages; } -ssbdf3::~ssbdf3() { - if (o_rhs.size()) o_rhs.free(); - if (o_qn.size()) o_qn.free(); - if (o_qhat.size()) o_qhat.free(); - if (o_ssbdf_b.size()) o_ssbdf_b.free(); - - if (ssbdf_b) free(ssbdf_b); - - rhsKernel.free(); -} - } //namespace TimeStepper + +} //namespace libp diff --git a/make.top b/make.top index 322ec2399..63aad7879 100644 --- a/make.top +++ b/make.top @@ -2,7 +2,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal @@ -27,57 +27,51 @@ #can be GNU or INTEL LIBP_ARCH=GNU -#absolute path to libparanumal +#absolute path to LIBP export LIBP_DIR:=$(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))) export LIBP_INCLUDE_DIR=${LIBP_DIR}/include -export LIBP_TPL_DIR=${LIBP_DIR}/3rdParty export LIBP_LIBS_DIR=${LIBP_DIR}/libs export LIBP_TEST_DIR=${LIBP_DIR}/test -#paths to installed blas and lapack libraries -export LIBP_BLAS_DIR =/usr/lib/x86_64-linux-gnu -export LIBP_LAPACK_DIR=/usr/lib/x86_64-linux-gnu -export LIBP_BLAS_LIB =-L${LIBP_BLAS_DIR} -lblas -export LIBP_LAPACK_LIB=-L${LIBP_LAPACK_DIR} -llapack +export LIBP_BLAS_DIR?=/usr/lib/x86_64-linux-gnu/openblas-serial +export LIBP_BLAS_LIB=-L${LIBP_BLAS_DIR} -lopenblas #include OCCA export OCCA_DIR=${LIBP_DIR}/occa #compilers to use for C/C++ -export LIBP_MPICC = mpicc -export LIBP_MPICXX= mpic++ -export LIBP_LD = 
mpic++ +export LIBP_CC = mpicc +export LIBP_CXX= mpic++ +export LIBP_LD = mpic++ -export LIBP_CFLAGS=-O2 -fopenmp -g -Wall -Wshadow -Wno-unused-function -export LIBP_CXXFLAGS=-O2 -fopenmp -g -Wall -Wshadow -Wno-unused-function -std=c++11 - -ifeq (1,${LIBP_COVERAGE}) -export LIBP_CXXFLAGS+= --coverage -fprofile-abs-path -endif - -export LIBP_MPICFLAGS=$(LIBP_CFLAGS) -export LIBP_MPICXXFLAGS=$(LIBP_CXXFLAGS) +export LIBP_INCLUDES=-I${LIBP_INCLUDE_DIR} -I${OCCA_DIR}/include +export LIBP_LIBS= -Wl,-rpath,$(LIBP_BLAS_DIR) ${LIBP_BLAS_LIB} \ + -Wl,-rpath,$(OCCA_DIR)/lib -L$(OCCA_DIR)/lib -locca -export LIBP_LIBS=${LIBP_BLAS_LIB} \ - ${LIBP_LAPACK_LIB} \ - -Wl,-rpath=$(OCCA_DIR)/lib -L$(OCCA_DIR)/lib -locca -export LIBP_DEFINES= +ifneq (,${debug}) + export LIBP_CFLAGS=-O0 -g -Wall -Wshadow -Wno-unused-function -Wno-unknown-pragmas + export LIBP_CXXFLAGS=-O0 -g -Wall -Wshadow -Wno-unused-function -Wno-unknown-pragmas -std=c++17 + export LIBP_DEFINES=-DLIBP_DEBUG +else + export LIBP_CFLAGS=-fopenmp -O3 -Wall -Wshadow -Wno-unused-function + export LIBP_CXXFLAGS=-fopenmp -O3 -Wall -Wshadow -Wno-unused-function -std=c++17 + export LIBP_DEFINES= -export LIBP_INCLUDES=-I${LIBP_INCLUDE_DIR} -I${OCCA_DIR}/include + ifeq (GNU,${LIBP_ARCH}) + LIBP_CFLAGS+= -mavx2 -ftree-vectorize -march=native -DGLIBCXX_PARALLEL + LIBP_CXXFLAGS+= -mavx2 -ftree-vectorize -march=native -DGLIBCXX_PARALLEL + else ifeq (INTEL,${LIBP_ARCH}) + LIBP_CFLAGS+=-funroll-loops -xHost + LIBP_CXXFLAGS+=-funroll-loops -xHost + else + $(error unknown arch for [LIBP_ARCH] specified) + endif +endif -ifeq (GNU,${LIBP_ARCH}) -# LIBP_CXXFLAGS+= -mavx2 -ftree-vectorize -march=native - #-funroll-all-loops - #LIBP_LIBS+=-L/opt/apps/gcc5_2/atlas/3.10.2/lib/ -llapack -latlas -lf77blas -lcblas -lptcblas -lptf77blas -lsatlas -lgfortran -L../../../libxsmm/lib -lxsmm -ldl -else ifeq (INTEL,${LIBP_ARCH}) - LIBP_CXXFLAGS+= -funroll-loops -xHost - LIBP_LIBS+=-L/opt/apps/intel15_3/mkl/11.2.3/lib/intel64 -lmkl_rt - # 
-fopt-info-vec-missed -fopt-info - #-fopt-info -else - $(error unknown arch for [LIBP_ARCH] specified) +ifeq (1,${LIBP_COVERAGE}) + export LIBP_CXXFLAGS+= --coverage -fprofile-abs-path endif export OBJ_COLOR = \033[0;36m diff --git a/makefile b/makefile index 5b2bc1429..c2befaa55 100644 --- a/makefile +++ b/makefile @@ -2,7 +2,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal diff --git a/occa b/occa index 327582bfb..11552d0dc 160000 --- a/occa +++ b/occa @@ -1 +1 @@ -Subproject commit 327582bfb6667defb008d743961fa38053960214 +Subproject commit 11552d0dc02fb9880f61f46e46115b6a50dada32 diff --git a/solvers/acoustics/acoustics.hpp b/solvers/acoustics/acoustics.hpp index a264ee89b..26bd8043a 100644 --- a/solvers/acoustics/acoustics.hpp +++ b/solvers/acoustics/acoustics.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -36,55 +36,56 @@ SOFTWARE. 
#define DACOUSTICS LIBP_DIR"/solvers/acoustics/" +using namespace libp; + class acousticsSettings_t: public settings_t { public: - acousticsSettings_t(MPI_Comm& _comm); + acousticsSettings_t(comm_t _comm); void report(); void parseFromFile(platformSettings_t& platformSettings, meshSettings_t& meshSettings, - const string filename); + const std::string filename); }; class acoustics_t: public solver_t { public: - mesh_t &mesh; + mesh_t mesh; int Nfields; - TimeStepper::timeStepper_t* timeStepper; + timeStepper_t timeStepper; - halo_t* traceHalo; + ogs::halo_t traceHalo; - dfloat *q; - occa::memory o_q; + memory q; + deviceMemory o_q; - occa::memory o_Mq; + deviceMemory o_Mq; - occa::kernel volumeKernel; - occa::kernel surfaceKernel; + kernel_t volumeKernel; + kernel_t surfaceKernel; - occa::kernel initialConditionKernel; + kernel_t initialConditionKernel; - acoustics_t() = delete; + acoustics_t() = default; acoustics_t(platform_t &_platform, mesh_t &_mesh, - acousticsSettings_t& _settings): - solver_t(_platform, _settings), mesh(_mesh) {} - - ~acoustics_t(); + acousticsSettings_t& _settings) { + Setup(_platform, _mesh, _settings); + } //setup - static acoustics_t& Setup(platform_t& platform, mesh_t& mesh, - acousticsSettings_t& settings); + void Setup(platform_t& _platform, mesh_t& _mesh, + acousticsSettings_t& _settings); void Run(); void Report(dfloat time, int tstep); - void PlotFields(dfloat* Q, char *fileName); + void PlotFields(memory Q, const std::string fileName); - void rhsf(occa::memory& o_q, occa::memory& o_rhs, const dfloat time); + void rhsf(deviceMemory& o_q, deviceMemory& o_rhs, const dfloat time); dfloat MaxWaveSpeed(); }; -#endif \ No newline at end of file +#endif diff --git a/solvers/acoustics/acousticsMain.cpp b/solvers/acoustics/acousticsMain.cpp index b9de52308..834f55bb3 100644 --- a/solvers/acoustics/acousticsMain.cpp +++ b/solvers/acoustics/acousticsMain.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel 
Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -29,39 +29,40 @@ SOFTWARE. int main(int argc, char **argv){ // start up MPI - MPI_Init(&argc, &argv); + Comm::Init(argc, argv); - MPI_Comm comm = MPI_COMM_WORLD; + LIBP_ABORT("Usage: ./acousticsMain setupfile", argc!=2); - if(argc!=2) - LIBP_ABORT(string("Usage: ./acousticsMain setupfile")); + { /*Scope so everything is destructed before MPI_Finalize */ + comm_t comm(Comm::World().Dup()); - //create default settings - platformSettings_t platformSettings(comm); - meshSettings_t meshSettings(comm); - acousticsSettings_t acousticsSettings(comm); + //create default settings + platformSettings_t platformSettings(comm); + meshSettings_t meshSettings(comm); + acousticsSettings_t acousticsSettings(comm); - //load settings from file - acousticsSettings.parseFromFile(platformSettings, meshSettings, - argv[1]); + //load settings from file + acousticsSettings.parseFromFile(platformSettings, meshSettings, + argv[1]); - // set up platform - platform_t platform(platformSettings); + // set up platform + platform_t platform(platformSettings); - platformSettings.report(); - meshSettings.report(); - acousticsSettings.report(); + platformSettings.report(); + meshSettings.report(); + acousticsSettings.report(); - // set up mesh - mesh_t& mesh = mesh_t::Setup(platform, meshSettings, comm); + // set up mesh + mesh_t mesh(platform, meshSettings, comm); - // set up acoustics solver - acoustics_t& acoustics = acoustics_t::Setup(platform, mesh, acousticsSettings); + // set up acoustics solver + acoustics_t acoustics(platform, mesh, acousticsSettings); - // run - acoustics.Run(); + // run + acoustics.Run(); + } // close down MPI - MPI_Finalize(); + Comm::Finalize(); return LIBP_SUCCESS; } diff --git 
a/solvers/acoustics/data/acousticsGaussian2D.h b/solvers/acoustics/data/acousticsGaussian2D.h index db34505dd..621a65159 100644 --- a/solvers/acoustics/data/acousticsGaussian2D.h +++ b/solvers/acoustics/data/acousticsGaussian2D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/acoustics/data/acousticsGaussian3D.h b/solvers/acoustics/data/acousticsGaussian3D.h index 14ce2bfe6..cedf44200 100644 --- a/solvers/acoustics/data/acousticsGaussian3D.h +++ b/solvers/acoustics/data/acousticsGaussian3D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -50,4 +50,4 @@ SOFTWARE. 
*(u) = 0.0; \ *(v) = 0.0; \ *(w) = 0.0; \ -} \ No newline at end of file +} diff --git a/solvers/acoustics/makefile b/solvers/acoustics/makefile index d19c9e494..bf2875350 100644 --- a/solvers/acoustics/makefile +++ b/solvers/acoustics/makefile @@ -2,7 +2,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal @@ -77,11 +77,8 @@ include ../../make.top endif endif -#gslib -GS_DIR=${LIBP_TPL_DIR}/gslib - #libraries -ACOUSTICS_LIBP_LIBS=timeStepper mesh ogs linAlg core +ACOUSTICS_LIBP_LIBS=timeStepper mesh parAdogs ogs linAlg core #includes INCLUDES=${LIBP_INCLUDES} \ @@ -92,11 +89,10 @@ DEFINES =${LIBP_DEFINES} \ -DLIBP_DIR='"${LIBP_DIR}"' #.cpp compilation flags -ACOUSTICS_CXXFLAGS=${LIBP_MPICXXFLAGS} ${DEFINES} ${INCLUDES} +ACOUSTICS_CXXFLAGS=${LIBP_CXXFLAGS} ${DEFINES} ${INCLUDES} #link libraries LIBS=-L${LIBP_LIBS_DIR} $(addprefix -l,$(ACOUSTICS_LIBP_LIBS)) \ - -L$(GS_DIR)/lib -lgs \ ${LIBP_LIBS} #link flags @@ -144,10 +140,10 @@ endif # rule for .cpp files %.o: %.cpp $(DEPS) | libp_libs ifneq (,${verbose}) - $(LIBP_MPICXX) -o $*.o -c $*.cpp $(ACOUSTICS_CXXFLAGS) + $(LIBP_CXX) -o $*.o -c $*.cpp $(ACOUSTICS_CXXFLAGS) else @printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n"; - @$(LIBP_MPICXX) -o $*.o -c $*.cpp $(ACOUSTICS_CXXFLAGS) + @$(LIBP_CXX) -o $*.o -c $*.cpp $(ACOUSTICS_CXXFLAGS) endif #cleanup @@ -158,8 +154,7 @@ clean-libs: clean ${MAKE} -C ${LIBP_LIBS_DIR} clean clean-kernels: clean-libs -# $(shell ${OCCA_DIR}/bin/occa clear all -y) - rm -rf ~/.occa/ + rm -rf ${LIBP_DIR}/.occa/ realclean: clean ${MAKE} -C ${LIBP_LIBS_DIR} realclean @@ -177,4 +172,4 @@ info: @true test: acousticsMain - @${MAKE} -C $(LIBP_TEST_DIR) --no-print-directory test-acoustics \ No newline at end 
of file + @${MAKE} -C $(LIBP_TEST_DIR) --no-print-directory test-acoustics diff --git a/solvers/acoustics/okl/acousticsInitialCondition2D.okl b/solvers/acoustics/okl/acousticsInitialCondition2D.okl index 71255c37d..7fcfde446 100644 --- a/solvers/acoustics/okl/acousticsInitialCondition2D.okl +++ b/solvers/acoustics/okl/acousticsInitialCondition2D.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/acoustics/okl/acousticsInitialCondition3D.okl b/solvers/acoustics/okl/acousticsInitialCondition3D.okl index 23972ab5a..c2cc6f0f8 100644 --- a/solvers/acoustics/okl/acousticsInitialCondition3D.okl +++ b/solvers/acoustics/okl/acousticsInitialCondition3D.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/acoustics/okl/acousticsSurfaceHex3D.okl b/solvers/acoustics/okl/acousticsSurfaceHex3D.okl index b102d29a8..972d6c96a 100644 --- a/solvers/acoustics/okl/acousticsSurfaceHex3D.okl +++ b/solvers/acoustics/okl/acousticsSurfaceHex3D.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -116,6 +116,7 @@ void surfaceTerms(const int e, // batch process elements 
@kernel void acousticsSurfaceHex3D(const dlong Nelements, + @restrict const dlong * elementIds, @restrict const dfloat * sgeo, @restrict const dfloat * LIFT, @restrict const dlong * vmapM, @@ -131,63 +132,66 @@ void surfaceTerms(const int e, // for all elements for(dlong eo=0;eo0){ acousticsDirichletConditions3D(bc, time, x[idM], y[idM], z[idM], nx, ny, nz, rM, uM, vM, wM, &rP, &uP, &vP, &wP); } @@ -129,7 +134,7 @@ void upwind(const dfloat nx, upwind(nx, ny, nz, rM, uM, vM, wM, rP, uP, vP, wP, &rflux, &uflux, &vflux, &wflux); - s_rflux[es][n] = sc*(-rflux ); + s_rflux[es][n] = sc*(-rflux ); s_uflux[es][n] = sc*(-uflux); s_vflux[es][n] = sc*(-vflux); s_wflux[es][n] = sc*(-wflux); @@ -138,14 +143,10 @@ void upwind(const dfloat nx, } } - // wait for all @shared memory writes of the previous inner loop to complete - @barrier("local"); - // for each node in the element for(int es=0;es0){ acousticsDirichletConditions2D(bc, time, x[idM], y[idM], nx, ny, rM, uM, vM, &rP, &uP, &vP); //should also add the Neumann BC here, but need uxM, uyM, vxM, abd vyM somehow @@ -131,14 +136,10 @@ void upwind(const dfloat nx, } } - // wait for all @shared memory writes of the previous inner loop to complete - @barrier("local"); - // for each node in the element for(int es=0;es Q, const std::string fileName){ FILE *fp; - fp = fopen(fileName, "w"); + fp = fopen(fileName.c_str(), "w"); fprintf(fp, "\n"); fprintf(fp, " \n"); @@ -44,33 +44,39 @@ void acoustics_t::PlotFields(dfloat* Q, char *fileName){ fprintf(fp, " \n"); //scratch space for interpolation - size_t NscratchBytes = mymax(mesh.Np, mesh.plotNp)*sizeof(dfloat); - dfloat* scratch = (dfloat *) malloc(2*NscratchBytes); + size_t Nscratch = std::max(mesh.Np, mesh.plotNp); + memory scratch(2*Nscratch); - dfloat* Ix = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iy = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iz = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); + memory Ix(mesh.plotNp); + memory Iy(mesh.plotNp); + 
memory Iz(mesh.plotNp); // compute plot node coordinates on the fly for(dlong e=0;e\n"); fprintf(fp, " \n"); - free(Ix); free(Iy); free(Iz); - - dfloat* Ip = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iu = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iv = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iw = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); + memory Ip(mesh.plotNp); + memory Iu(mesh.plotNp); + memory Iv(mesh.plotNp); + memory Iw(mesh.plotNp); // write out density fprintf(fp, " \n"); @@ -105,8 +111,6 @@ void acoustics_t::PlotFields(dfloat* Q, char *fileName){ fprintf(fp, " \n"); fprintf(fp, " \n"); - free(Ip); free(Iu); free(Iv); free(Iw); - fprintf(fp, " \n"); fprintf(fp, " \n"); @@ -147,6 +151,4 @@ void acoustics_t::PlotFields(dfloat* Q, char *fileName){ fprintf(fp, " \n"); fprintf(fp, "\n"); fclose(fp); - - free(scratch); } diff --git a/solvers/acoustics/src/acousticsReport.cpp b/solvers/acoustics/src/acousticsReport.cpp index b5d1876ba..cf7913a32 100644 --- a/solvers/acoustics/src/acousticsReport.cpp +++ b/solvers/acoustics/src/acousticsReport.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -34,7 +34,7 @@ void acoustics_t::Report(dfloat time, int tstep){ mesh.MassMatrixApply(o_q, o_Mq); dlong Nentries = mesh.Nelements*mesh.Np*Nfields; - dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm)); + dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm)); if(mesh.rank==0) printf("%5.2f (%d), %5.2f (time, timestep, norm)\n", time, tstep, norm2); @@ -45,11 +45,11 @@ void acoustics_t::Report(dfloat time, int tstep){ o_q.copyTo(q); // output field files - string name; + 
std::string name; settings.getSetting("OUTPUT FILE NAME", name); char fname[BUFSIZ]; sprintf(fname, "%s_%04d_%04d.vtu", name.c_str(), mesh.rank, frame++); - PlotFields(q, fname); + PlotFields(q, std::string(fname)); } } diff --git a/solvers/acoustics/src/acousticsRun.cpp b/solvers/acoustics/src/acousticsRun.cpp index 188a7ef17..ee908a769 100644 --- a/solvers/acoustics/src/acousticsRun.cpp +++ b/solvers/acoustics/src/acousticsRun.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -47,9 +47,9 @@ void acoustics_t::Run(){ dfloat vmax = MaxWaveSpeed(); dfloat dt = cfl*hmin/(vmax*(mesh.N+1.)*(mesh.N+1.)); - timeStepper->SetTimeStep(dt); + timeStepper.SetTimeStep(dt); - timeStepper->Run(o_q, startTime, finalTime); + timeStepper.Run(*this, o_q, startTime, finalTime); // output norm of final solution { @@ -57,7 +57,7 @@ void acoustics_t::Run(){ mesh.MassMatrixApply(o_q, o_Mq); dlong Nentries = mesh.Nelements*mesh.Np*Nfields; - dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm)); + dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm)); if(mesh.rank==0) printf("Solution norm = %17.15lg\n", norm2); diff --git a/solvers/acoustics/src/acousticsSettings.cpp b/solvers/acoustics/src/acousticsSettings.cpp index 172b2962d..9198e3261 100644 --- a/solvers/acoustics/src/acousticsSettings.cpp +++ b/solvers/acoustics/src/acousticsSettings.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 
documentation files (the "Software"), to deal @@ -27,7 +27,7 @@ SOFTWARE. #include "acoustics.hpp" //settings for acoustics solver -acousticsSettings_t::acousticsSettings_t(MPI_Comm& _comm): +acousticsSettings_t::acousticsSettings_t(comm_t _comm): settings_t(_comm) { newSetting("DATA FILE", @@ -66,10 +66,7 @@ acousticsSettings_t::acousticsSettings_t(MPI_Comm& _comm): void acousticsSettings_t::report() { - int rank; - MPI_Comm_rank(comm, &rank); - - if (rank==0) { + if (comm.rank()==0) { std::cout << "Acoustics Settings:\n\n"; reportSetting("DATA FILE"); reportSetting("TIME INTEGRATOR"); @@ -83,15 +80,15 @@ void acousticsSettings_t::report() { void acousticsSettings_t::parseFromFile(platformSettings_t& platformSettings, meshSettings_t& meshSettings, - const string filename) { + const std::string filename) { //read all settings from file settings_t s(comm); s.readSettingsFromFile(filename); for(auto it = s.settings.begin(); it != s.settings.end(); ++it) { - setting_t* set = it->second; - const string name = set->getName(); - const string val = set->getVal(); + setting_t& set = it->second; + const std::string name = set.getName(); + const std::string val = set.getVal(); if (platformSettings.hasSetting(name)) platformSettings.changeSetting(name, val); else if (meshSettings.hasSetting(name)) @@ -99,9 +96,7 @@ void acousticsSettings_t::parseFromFile(platformSettings_t& platformSettings, else if (hasSetting(name)) //self changeSetting(name, val); else { - stringstream ss; - ss << "Unknown setting: [" << name << "] requested"; - LIBP_ABORT(ss.str()); + LIBP_FORCE_ABORT("Unknown setting: [" << name << "] requested"); } } -} \ No newline at end of file +} diff --git a/solvers/acoustics/src/acousticsSetup.cpp b/solvers/acoustics/src/acousticsSetup.cpp index 1de7585ec..4dea74a82 100644 --- a/solvers/acoustics/src/acousticsSetup.cpp +++ b/solvers/acoustics/src/acousticsSetup.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, 
Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,120 +26,118 @@ SOFTWARE. #include "acoustics.hpp" -acoustics_t& acoustics_t::Setup(platform_t& platform, mesh_t& mesh, - acousticsSettings_t& settings){ +void acoustics_t::Setup(platform_t& _platform, mesh_t& _mesh, + acousticsSettings_t& _settings){ - acoustics_t* acoustics = new acoustics_t(platform, mesh, settings); + platform = _platform; + mesh = _mesh; + comm = _mesh.comm; + settings = _settings; - acoustics->Nfields = (mesh.dim==3) ? 4:3; + Nfields = (mesh.dim==3) ? 4:3; - dlong Nlocal = mesh.Nelements*mesh.Np*acoustics->Nfields; - dlong Nhalo = mesh.totalHaloPairs*mesh.Np*acoustics->Nfields; + dlong Nlocal = mesh.Nelements*mesh.Np*Nfields; + dlong Nhalo = mesh.totalHaloPairs*mesh.Np*Nfields; + + //Trigger JIT kernel builds + ogs::InitializeKernels(platform, ogs::Dfloat, ogs::Add); + + //setup linear algebra module + platform.linAlg().InitKernels({"innerProd"}); + + /*setup trace halo exchange */ + traceHalo = mesh.HaloTraceSetup(Nfields); //setup timeStepper if (settings.compareSetting("TIME INTEGRATOR","AB3")){ - acoustics->timeStepper = new TimeStepper::ab3(mesh.Nelements, mesh.totalHaloPairs, - mesh.Np, acoustics->Nfields, *acoustics); + timeStepper.Setup(mesh.Nelements, + mesh.totalHaloPairs, + mesh.Np, Nfields, platform, comm); } else if (settings.compareSetting("TIME INTEGRATOR","LSERK4")){ - acoustics->timeStepper = new TimeStepper::lserk4(mesh.Nelements, mesh.totalHaloPairs, - mesh.Np, acoustics->Nfields, *acoustics); + timeStepper.Setup(mesh.Nelements, + mesh.totalHaloPairs, + mesh.Np, Nfields, platform, comm); } else if (settings.compareSetting("TIME INTEGRATOR","DOPRI5")){ - acoustics->timeStepper = new TimeStepper::dopri5(mesh.Nelements, mesh.totalHaloPairs, - mesh.Np, acoustics->Nfields, 
*acoustics, mesh.comm); + timeStepper.Setup(mesh.Nelements, + mesh.totalHaloPairs, + mesh.Np, Nfields, platform, comm); } - //setup linear algebra module - platform.linAlg.InitKernels({"innerProd"}); - // set penalty parameter dfloat Lambda2 = 0.5; - /*setup trace halo exchange */ - acoustics->traceHalo = mesh.HaloTraceSetup(acoustics->Nfields); - // compute samples of q at interpolation nodes - acoustics->q = (dfloat*) calloc(Nlocal+Nhalo, sizeof(dfloat)); - acoustics->o_q = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), acoustics->q); + q.malloc(Nlocal+Nhalo); + o_q = platform.malloc(q); //storage for M*q during reporting - acoustics->o_Mq = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), acoustics->q); - mesh.MassMatrixKernelSetup(acoustics->Nfields); // mass matrix operator + o_Mq = platform.malloc(q); + mesh.MassMatrixKernelSetup(Nfields); // mass matrix operator // OCCA build stuff - occa::properties kernelInfo = mesh.props; //copy base occa properties + properties_t kernelInfo = mesh.props; //copy base occa properties //add boundary data to kernel info - string dataFileName; + std::string dataFileName; settings.getSetting("DATA FILE", dataFileName); kernelInfo["includes"] += dataFileName; - - kernelInfo["defines/" "p_Nfields"]= acoustics->Nfields; + kernelInfo["defines/" "p_Nfields"]= Nfields; const dfloat p_half = 1./2.; kernelInfo["defines/" "p_half"]= p_half; - int maxNodes = mymax(mesh.Np, (mesh.Nfp*mesh.Nfaces)); + int maxNodes = std::max(mesh.Np, (mesh.Nfp*mesh.Nfaces)); kernelInfo["defines/" "p_maxNodes"]= maxNodes; int blockMax = 256; if (platform.device.mode() == "CUDA") blockMax = 512; - int NblockV = mymax(1, blockMax/mesh.Np); + int NblockV = std::max(1, blockMax/mesh.Np); kernelInfo["defines/" "p_NblockV"]= NblockV; - int NblockS = mymax(1, blockMax/maxNodes); + int NblockS = std::max(1, blockMax/maxNodes); kernelInfo["defines/" "p_NblockS"]= NblockS; kernelInfo["defines/" "p_Lambda2"]= Lambda2; - kernelInfo["parser/" "automate-add-barriers"] = 
"disabled"; - // set kernel name suffix - char *suffix; - if(mesh.elementType==TRIANGLES) - suffix = strdup("Tri2D"); - if(mesh.elementType==QUADRILATERALS) - suffix = strdup("Quad2D"); - if(mesh.elementType==TETRAHEDRA) - suffix = strdup("Tet3D"); - if(mesh.elementType==HEXAHEDRA) - suffix = strdup("Hex3D"); - - char fileName[BUFSIZ], kernelName[BUFSIZ]; + std::string suffix; + if(mesh.elementType==Mesh::TRIANGLES) + suffix = "Tri2D"; + if(mesh.elementType==Mesh::QUADRILATERALS) + suffix = "Quad2D"; + if(mesh.elementType==Mesh::TETRAHEDRA) + suffix = "Tet3D"; + if(mesh.elementType==Mesh::HEXAHEDRA) + suffix = "Hex3D"; + + std::string oklFilePrefix = DACOUSTICS "/okl/"; + std::string oklFileSuffix = ".okl"; + + std::string fileName, kernelName; // kernels from volume file - sprintf(fileName, DACOUSTICS "/okl/acousticsVolume%s.okl", suffix); - sprintf(kernelName, "acousticsVolume%s", suffix); + fileName = oklFilePrefix + "acousticsVolume" + suffix + oklFileSuffix; + kernelName = "acousticsVolume" + suffix; - acoustics->volumeKernel = platform.buildKernel(fileName, kernelName, + volumeKernel = platform.buildKernel(fileName, kernelName, kernelInfo); // kernels from surface file - sprintf(fileName, DACOUSTICS "/okl/acousticsSurface%s.okl", suffix); - sprintf(kernelName, "acousticsSurface%s", suffix); + fileName = oklFilePrefix + "acousticsSurface" + suffix + oklFileSuffix; + kernelName = "acousticsSurface" + suffix; - acoustics->surfaceKernel = platform.buildKernel(fileName, kernelName, + surfaceKernel = platform.buildKernel(fileName, kernelName, kernelInfo); if (mesh.dim==2) { - sprintf(fileName, DACOUSTICS "/okl/acousticsInitialCondition2D.okl"); - sprintf(kernelName, "acousticsInitialCondition2D"); + fileName = oklFilePrefix + "acousticsInitialCondition2D" + oklFileSuffix; + kernelName = "acousticsInitialCondition2D"; } else { - sprintf(fileName, DACOUSTICS "/okl/acousticsInitialCondition3D.okl"); - sprintf(kernelName, "acousticsInitialCondition3D"); + fileName = 
oklFilePrefix + "acousticsInitialCondition3D" + oklFileSuffix; + kernelName = "acousticsInitialCondition3D"; } - acoustics->initialConditionKernel = platform.buildKernel(fileName, kernelName, + initialConditionKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - - return *acoustics; -} - -acoustics_t::~acoustics_t() { - volumeKernel.free(); - surfaceKernel.free(); - initialConditionKernel.free(); - - if (timeStepper) delete timeStepper; - if (traceHalo) traceHalo->Free(); } diff --git a/solvers/acoustics/src/acousticsStep.cpp b/solvers/acoustics/src/acousticsStep.cpp index 5efb33271..a2d040a7e 100644 --- a/solvers/acoustics/src/acousticsStep.cpp +++ b/solvers/acoustics/src/acousticsStep.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -33,10 +33,10 @@ dfloat acoustics_t::MaxWaveSpeed(){ } //evaluate ODE rhs = f(q,t) -void acoustics_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){ +void acoustics_t::rhsf(deviceMemory& o_Q, deviceMemory& o_RHS, const dfloat T){ // extract q halo on DEVICE - traceHalo->ExchangeStart(o_Q, 1, ogs_dfloat); + traceHalo.ExchangeStart(o_Q, 1); volumeKernel(mesh.Nelements, mesh.o_vgeo, @@ -44,18 +44,35 @@ void acoustics_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){ o_Q, o_RHS); - traceHalo->ExchangeFinish(o_Q, 1, ogs_dfloat); - - surfaceKernel(mesh.Nelements, - mesh.o_sgeo, - mesh.o_LIFT, - mesh.o_vmapM, - mesh.o_vmapP, - mesh.o_EToB, - T, - mesh.o_x, - mesh.o_y, - mesh.o_z, - o_Q, - o_RHS); + if (mesh.NinternalElements) + surfaceKernel(mesh.NinternalElements, + mesh.o_internalElementIds, + mesh.o_sgeo, + mesh.o_LIFT, + mesh.o_vmapM, + mesh.o_vmapP, + mesh.o_EToB, + T, + mesh.o_x, + mesh.o_y, + 
mesh.o_z, + o_Q, + o_RHS); + + traceHalo.ExchangeFinish(o_Q, 1); + + if (mesh.NhaloElements) + surfaceKernel(mesh.NhaloElements, + mesh.o_haloElementIds, + mesh.o_sgeo, + mesh.o_LIFT, + mesh.o_vmapM, + mesh.o_vmapP, + mesh.o_EToB, + T, + mesh.o_x, + mesh.o_y, + mesh.o_z, + o_Q, + o_RHS); } diff --git a/solvers/advection/advection.hpp b/solvers/advection/advection.hpp index 5ec62bbec..62d5ed434 100644 --- a/solvers/advection/advection.hpp +++ b/solvers/advection/advection.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -36,53 +36,54 @@ SOFTWARE. #define DADVECTION LIBP_DIR"/solvers/advection/" +using namespace libp; + class advectionSettings_t: public settings_t { public: - advectionSettings_t(MPI_Comm& _comm); + advectionSettings_t(comm_t _comm); void report(); void parseFromFile(platformSettings_t& platformSettings, meshSettings_t& meshSettings, - const string filename); + const std::string filename); }; class advection_t: public solver_t { public: - mesh_t &mesh; - TimeStepper::timeStepper_t* timeStepper; + mesh_t mesh; + timeStepper_t timeStepper; - halo_t* traceHalo; + ogs::halo_t traceHalo; - dfloat *q; - occa::memory o_q; + memory q; + deviceMemory o_q; - occa::memory o_Mq; + deviceMemory o_Mq; - occa::kernel volumeKernel; - occa::kernel surfaceKernel; + kernel_t volumeKernel; + kernel_t surfaceKernel; - occa::kernel initialConditionKernel; - occa::kernel maxWaveSpeedKernel; + kernel_t initialConditionKernel; + kernel_t maxWaveSpeedKernel; - advection_t() = delete; + advection_t() = default; advection_t(platform_t &_platform, mesh_t &_mesh, - advectionSettings_t& _settings): - solver_t(_platform, _settings), mesh(_mesh) {} - - ~advection_t(); + 
advectionSettings_t& _settings) { + Setup(_platform, _mesh, _settings); + } //setup - static advection_t& Setup(platform_t& platform, mesh_t& mesh, - advectionSettings_t& settings); + void Setup(platform_t& platform, mesh_t& mesh, + advectionSettings_t& settings); void Run(); void Report(dfloat time, int tstep); - void PlotFields(dfloat* Q, char *fileName); + void PlotFields(memory Q, const std::string fileName); - void rhsf(occa::memory& o_q, occa::memory& o_rhs, const dfloat time); + void rhsf(deviceMemory& o_q, deviceMemory& o_rhs, const dfloat time); - dfloat MaxWaveSpeed(occa::memory& o_Q, const dfloat T); + dfloat MaxWaveSpeed(deviceMemory& o_Q, const dfloat T); }; #endif diff --git a/solvers/advection/advectionMain.cpp b/solvers/advection/advectionMain.cpp index e45076f2e..ba7c77ff6 100644 --- a/solvers/advection/advectionMain.cpp +++ b/solvers/advection/advectionMain.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -29,38 +29,39 @@ SOFTWARE. 
int main(int argc, char **argv){ // start up MPI - MPI_Init(&argc, &argv); + Comm::Init(argc, argv); - MPI_Comm comm = MPI_COMM_WORLD; + LIBP_ABORT("Usage: ./advectionMain setupfile", argc!=2); - if(argc!=2) - LIBP_ABORT(string("Usage: ./advectionMain setupfile")); + { /*Scope so everything is destructed before MPI_Finalize */ + comm_t comm(Comm::World().Dup()); - //create default settings - platformSettings_t platformSettings(comm); - meshSettings_t meshSettings(comm); - advectionSettings_t advectionSettings(comm); + //create default settings + platformSettings_t platformSettings(comm); + meshSettings_t meshSettings(comm); + advectionSettings_t advectionSettings(comm); - //load settings from file - advectionSettings.parseFromFile(platformSettings, meshSettings, argv[1]); + //load settings from file + advectionSettings.parseFromFile(platformSettings, meshSettings, argv[1]); - // set up platform - platform_t platform(platformSettings); + // set up platform + platform_t platform(platformSettings); - platformSettings.report(); - meshSettings.report(); - advectionSettings.report(); + platformSettings.report(); + meshSettings.report(); + advectionSettings.report(); - // set up mesh - mesh_t& mesh = mesh_t::Setup(platform, meshSettings, comm); + // set up mesh + mesh_t mesh(platform, meshSettings, comm); - // set up advection solver - advection_t& advection = advection_t::Setup(platform, mesh, advectionSettings); + // set up advection solver + advection_t advection(platform, mesh, advectionSettings); - // run - advection.Run(); + // run + advection.Run(); + } // close down MPI - MPI_Finalize(); + Comm::Finalize(); return LIBP_SUCCESS; } diff --git a/solvers/advection/data/advectionLinear2D.h b/solvers/advection/data/advectionLinear2D.h index eab471a34..03be74d4c 100644 --- a/solvers/advection/data/advectionLinear2D.h +++ b/solvers/advection/data/advectionLinear2D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali 
Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/advection/data/advectionLinear3D.h b/solvers/advection/data/advectionLinear3D.h index 1dfeaa752..9234bc3dd 100644 --- a/solvers/advection/data/advectionLinear3D.h +++ b/solvers/advection/data/advectionLinear3D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/advection/makefile b/solvers/advection/makefile index 34add97f0..497f3157c 100644 --- a/solvers/advection/makefile +++ b/solvers/advection/makefile @@ -2,7 +2,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal @@ -78,11 +78,8 @@ include ../../make.top endif endif -#gslib -GS_DIR=${LIBP_TPL_DIR}/gslib - #libraries -ADVECTION_LIBP_LIBS=timeStepper mesh ogs linAlg core +ADVECTION_LIBP_LIBS=timeStepper mesh parAdogs ogs linAlg core #includes INCLUDES=${LIBP_INCLUDES} \ @@ -93,11 +90,10 @@ DEFINES =${LIBP_DEFINES} \ -DLIBP_DIR='"${LIBP_DIR}"' #.cpp compilation flags -ADVECTION_CXXFLAGS=${LIBP_MPICXXFLAGS} ${DEFINES} ${INCLUDES} +ADVECTION_CXXFLAGS=${LIBP_CXXFLAGS} ${DEFINES} ${INCLUDES} #link libraries LIBS=-L${LIBP_LIBS_DIR} $(addprefix -l,$(ADVECTION_LIBP_LIBS)) \ - -L$(GS_DIR)/lib -lgs \ ${LIBP_LIBS} #link flags @@ -145,10 +141,10 @@ endif # rule for 
.cpp files %.o: %.cpp $(DEPS) | libp_libs ifneq (,${verbose}) - $(LIBP_MPICXX) -o $*.o -c $*.cpp $(ADVECTION_CXXFLAGS) + $(LIBP_CXX) -o $*.o -c $*.cpp $(ADVECTION_CXXFLAGS) else @printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n"; - @$(LIBP_MPICXX) -o $*.o -c $*.cpp $(ADVECTION_CXXFLAGS) + @$(LIBP_CXX) -o $*.o -c $*.cpp $(ADVECTION_CXXFLAGS) endif #cleanup @@ -159,8 +155,7 @@ clean-libs: clean ${MAKE} -C ${LIBP_LIBS_DIR} clean clean-kernels: clean-libs -# $(shell ${OCCA_DIR}/bin/occa clear all -y) - rm -rf ~/.occa/ + rm -rf ${LIBP_DIR}/.occa/ realclean: clean ${MAKE} -C ${LIBP_LIBS_DIR} realclean diff --git a/solvers/advection/okl/advectionInitialCondition2D.okl b/solvers/advection/okl/advectionInitialCondition2D.okl index bc73181a7..b4f171be6 100644 --- a/solvers/advection/okl/advectionInitialCondition2D.okl +++ b/solvers/advection/okl/advectionInitialCondition2D.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/advection/okl/advectionInitialCondition3D.okl b/solvers/advection/okl/advectionInitialCondition3D.okl index aeb36efcb..c7e6961b5 100644 --- a/solvers/advection/okl/advectionInitialCondition3D.okl +++ b/solvers/advection/okl/advectionInitialCondition3D.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/advection/okl/advectionMaxWaveSpeedHex3D.okl b/solvers/advection/okl/advectionMaxWaveSpeedHex3D.okl index 
b4334d389..4660d4beb 100644 --- a/solvers/advection/okl/advectionMaxWaveSpeedHex3D.okl +++ b/solvers/advection/okl/advectionMaxWaveSpeedHex3D.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,7 +25,7 @@ SOFTWARE. */ @kernel void advectionMaxWaveSpeedHex3D(const dlong Nelements, - @restrict const dfloat * vgeo, + @restrict const dfloat * wJ, @restrict const dfloat * sgeo, @restrict const dlong * vmapM, @restrict const int * EToB, @@ -52,7 +52,7 @@ SOFTWARE. #pragma unroll p_Nq for(int k=0;k Q, const std::string fileName){ FILE *fp; - fp = fopen(fileName, "w"); + fp = fopen(fileName.c_str(), "w"); fprintf(fp, "\n"); fprintf(fp, " \n"); @@ -44,30 +44,36 @@ void advection_t::PlotFields(dfloat* Q, char *fileName){ fprintf(fp, " \n"); //scratch space for interpolation - size_t NscratchBytes = mymax(mesh.Np, mesh.plotNp)*sizeof(dfloat); - dfloat* scratch = (dfloat *) malloc(2*NscratchBytes); + size_t Nscratch = std::max(mesh.Np, mesh.plotNp); + memory scratch(2*Nscratch); - dfloat* Ix = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iy = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iz = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); + memory Ix(mesh.plotNp); + memory Iy(mesh.plotNp); + memory Iz(mesh.plotNp); // compute plot node coordinates on the fly for(dlong e=0;e\n"); fprintf(fp, " \n"); - free(Ix); free(Iy); free(Iz); - - dfloat* Ip = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); + memory Ip(mesh.plotNp); // write out field fprintf(fp, " \n"); @@ -83,8 +89,6 @@ void advection_t::PlotFields(dfloat* Q, char *fileName){ fprintf(fp, " \n"); fprintf(fp, " \n"); - free(Ip); - fprintf(fp, " \n"); fprintf(fp, " \n"); @@ -125,6 +129,4 @@ void 
advection_t::PlotFields(dfloat* Q, char *fileName){ fprintf(fp, " \n"); fprintf(fp, "\n"); fclose(fp); - - free(scratch); } diff --git a/solvers/advection/src/advectionReport.cpp b/solvers/advection/src/advectionReport.cpp index 6036f4384..b8026aee0 100644 --- a/solvers/advection/src/advectionReport.cpp +++ b/solvers/advection/src/advectionReport.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -34,7 +34,7 @@ void advection_t::Report(dfloat time, int tstep){ mesh.MassMatrixApply(o_q, o_Mq); dlong Nentries = mesh.Nelements*mesh.Np; - dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm)); + dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm)); if(mesh.rank==0) printf("%5.2f (%d), %5.2f (time, timestep, norm)\n", time, tstep, norm2); @@ -45,11 +45,11 @@ void advection_t::Report(dfloat time, int tstep){ o_q.copyTo(q); // output field files - string name; + std::string name; settings.getSetting("OUTPUT FILE NAME", name); char fname[BUFSIZ]; sprintf(fname, "%s_%04d_%04d.vtu", name.c_str(), mesh.rank, frame++); - PlotFields(q, fname); + PlotFields(q, std::string(fname)); } } diff --git a/solvers/advection/src/advectionRun.cpp b/solvers/advection/src/advectionRun.cpp index e0d33d35a..613fd1f53 100644 --- a/solvers/advection/src/advectionRun.cpp +++ b/solvers/advection/src/advectionRun.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to 
deal @@ -46,9 +46,9 @@ void advection_t::Run(){ dfloat vmax = MaxWaveSpeed(o_q, startTime); dfloat dt = cfl/(vmax*(mesh.N+1.)*(mesh.N+1.)); - timeStepper->SetTimeStep(dt); + timeStepper.SetTimeStep(dt); - timeStepper->Run(o_q, startTime, finalTime); + timeStepper.Run(*this, o_q, startTime, finalTime); // output norm of final solution { @@ -56,7 +56,7 @@ void advection_t::Run(){ mesh.MassMatrixApply(o_q, o_Mq); dlong Nentries = mesh.Nelements*mesh.Np; - dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm)); + dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm)); if(mesh.rank==0) printf("Solution norm = %17.15lg\n", norm2); diff --git a/solvers/advection/src/advectionSettings.cpp b/solvers/advection/src/advectionSettings.cpp index f34793954..d633235be 100644 --- a/solvers/advection/src/advectionSettings.cpp +++ b/solvers/advection/src/advectionSettings.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,7 +27,7 @@ SOFTWARE. 
#include "advection.hpp" //settings for advection solver -advectionSettings_t::advectionSettings_t(MPI_Comm& _comm): +advectionSettings_t::advectionSettings_t(comm_t _comm): settings_t(_comm) { newSetting("DATA FILE", @@ -66,10 +66,7 @@ advectionSettings_t::advectionSettings_t(MPI_Comm& _comm): void advectionSettings_t::report() { - int rank; - MPI_Comm_rank(comm, &rank); - - if (rank==0) { + if (comm.rank()==0) { std::cout << "Advection Settings:\n\n"; reportSetting("DATA FILE"); reportSetting("TIME INTEGRATOR"); @@ -83,15 +80,15 @@ void advectionSettings_t::report() { void advectionSettings_t::parseFromFile(platformSettings_t& platformSettings, meshSettings_t& meshSettings, - const string filename) { + const std::string filename) { //read all settings from file settings_t s(comm); s.readSettingsFromFile(filename); for(auto it = s.settings.begin(); it != s.settings.end(); ++it) { - setting_t* set = it->second; - const string name = set->getName(); - const string val = set->getVal(); + setting_t& set = it->second; + const std::string name = set.getName(); + const std::string val = set.getVal(); if (platformSettings.hasSetting(name)) platformSettings.changeSetting(name, val); else if (meshSettings.hasSetting(name)) @@ -99,9 +96,7 @@ void advectionSettings_t::parseFromFile(platformSettings_t& platformSettings, else if (hasSetting(name)) //self changeSetting(name, val); else { - stringstream ss; - ss << "Unknown setting: [" << name << "] requested"; - LIBP_ABORT(ss.str()); + LIBP_FORCE_ABORT("Unknown setting: [" << name << "] requested"); } } -} \ No newline at end of file +} diff --git a/solvers/advection/src/advectionSetup.cpp b/solvers/advection/src/advectionSetup.cpp index 918054239..43fb51337 100644 --- a/solvers/advection/src/advectionSetup.cpp +++ b/solvers/advection/src/advectionSetup.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse 
Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,112 +26,109 @@ SOFTWARE. #include "advection.hpp" -advection_t& advection_t::Setup(platform_t& platform, mesh_t& mesh, - advectionSettings_t& settings){ +void advection_t::Setup(platform_t& _platform, mesh_t& _mesh, + advectionSettings_t& _settings){ - advection_t* advection = new advection_t(platform, mesh, settings); + platform = _platform; + mesh = _mesh; + comm = mesh.comm; + settings = _settings; dlong Nlocal = mesh.Nelements*mesh.Np; dlong Nhalo = mesh.totalHaloPairs*mesh.Np; + //Trigger JIT kernel builds + ogs::InitializeKernels(platform, ogs::Dfloat, ogs::Add); + + //setup linear algebra module + platform.linAlg().InitKernels({"innerProd", "max"}); + + /*setup trace halo exchange */ + traceHalo = mesh.HaloTraceSetup(1); //one field + //setup timeStepper if (settings.compareSetting("TIME INTEGRATOR","AB3")){ - advection->timeStepper = new TimeStepper::ab3(mesh.Nelements, mesh.totalHaloPairs, - mesh.Np, 1, *advection); + timeStepper.Setup(mesh.Nelements, + mesh.totalHaloPairs, + mesh.Np, 1, platform, comm); } else if (settings.compareSetting("TIME INTEGRATOR","LSERK4")){ - advection->timeStepper = new TimeStepper::lserk4(mesh.Nelements, mesh.totalHaloPairs, - mesh.Np, 1, *advection); + timeStepper.Setup(mesh.Nelements, + mesh.totalHaloPairs, + mesh.Np, 1, platform, comm); } else if (settings.compareSetting("TIME INTEGRATOR","DOPRI5")){ - advection->timeStepper = new TimeStepper::dopri5(mesh.Nelements, mesh.totalHaloPairs, - mesh.Np, 1, *advection, mesh.comm); + timeStepper.Setup(mesh.Nelements, + mesh.totalHaloPairs, + mesh.Np, 1, platform, comm); } - //setup linear algebra module - platform.linAlg.InitKernels({"innerProd", "max"}); - - /*setup trace halo exchange */ - advection->traceHalo = mesh.HaloTraceSetup(1); //one field - // compute samples of q at interpolation nodes - 
advection->q = (dfloat*) calloc(Nlocal+Nhalo, sizeof(dfloat)); - advection->o_q = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), advection->q); + q.malloc(Nlocal+Nhalo); + o_q = platform.malloc(q); //storage for M*q during reporting - advection->o_Mq = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), advection->q); + o_Mq = platform.malloc(q); mesh.MassMatrixKernelSetup(1); // mass matrix operator // OCCA build stuff - occa::properties kernelInfo = mesh.props; //copy base occa properties + properties_t kernelInfo = mesh.props; //copy base occa properties //add boundary data to kernel info - string dataFileName; + std::string dataFileName; settings.getSetting("DATA FILE", dataFileName); kernelInfo["includes"] += dataFileName; - kernelInfo["defines/" "p_Nfields"]= 1; - - int maxNodes = mymax(mesh.Np, (mesh.Nfp*mesh.Nfaces)); + int maxNodes = std::max(mesh.Np, (mesh.Nfp*mesh.Nfaces)); kernelInfo["defines/" "p_maxNodes"]= maxNodes; int blockMax = 256; if (platform.device.mode() == "CUDA") blockMax = 512; - int NblockV = mymax(1, blockMax/mesh.Np); + int NblockV = std::max(1, blockMax/mesh.Np); kernelInfo["defines/" "p_NblockV"]= NblockV; - int NblockS = mymax(1, blockMax/maxNodes); + int NblockS = std::max(1, blockMax/maxNodes); kernelInfo["defines/" "p_NblockS"]= NblockS; - kernelInfo["parser/" "automate-add-barriers"] = "disabled"; - // set kernel name suffix - char *suffix; - if(mesh.elementType==TRIANGLES) - suffix = strdup("Tri2D"); - if(mesh.elementType==QUADRILATERALS) - suffix = strdup("Quad2D"); - if(mesh.elementType==TETRAHEDRA) - suffix = strdup("Tet3D"); - if(mesh.elementType==HEXAHEDRA) - suffix = strdup("Hex3D"); - - char fileName[BUFSIZ], kernelName[BUFSIZ]; + std::string suffix; + if(mesh.elementType==Mesh::TRIANGLES) + suffix = "Tri2D"; + if(mesh.elementType==Mesh::QUADRILATERALS) + suffix = "Quad2D"; + if(mesh.elementType==Mesh::TETRAHEDRA) + suffix = "Tet3D"; + if(mesh.elementType==Mesh::HEXAHEDRA) + suffix = "Hex3D"; + + std::string oklFilePrefix = 
DADVECTION "/okl/"; + std::string oklFileSuffix = ".okl"; + + std::string fileName, kernelName; // kernels from volume file - sprintf(fileName, DADVECTION "/okl/advectionVolume%s.okl", suffix); - sprintf(kernelName, "advectionVolume%s", suffix); + fileName = oklFilePrefix + "advectionVolume" + suffix + oklFileSuffix; + kernelName = "advectionVolume" + suffix; + + volumeKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - advection->volumeKernel = platform.buildKernel(fileName, kernelName, kernelInfo); // kernels from surface file - sprintf(fileName, DADVECTION "/okl/advectionSurface%s.okl", suffix); - sprintf(kernelName, "advectionSurface%s", suffix); + fileName = oklFilePrefix + "advectionSurface" + suffix + oklFileSuffix; + kernelName = "advectionSurface" + suffix; - advection->surfaceKernel = platform.buildKernel(fileName, kernelName, kernelInfo); + surfaceKernel = platform.buildKernel(fileName, kernelName, kernelInfo); if (mesh.dim==2) { - sprintf(fileName, DADVECTION "/okl/advectionInitialCondition2D.okl"); - sprintf(kernelName, "advectionInitialCondition2D"); + fileName = oklFilePrefix + "advectionInitialCondition2D" + oklFileSuffix; + kernelName = "advectionInitialCondition2D"; } else { - sprintf(fileName, DADVECTION "/okl/advectionInitialCondition3D.okl"); - sprintf(kernelName, "advectionInitialCondition3D"); + fileName = oklFilePrefix + "advectionInitialCondition3D" + oklFileSuffix; + kernelName = "advectionInitialCondition3D"; } - advection->initialConditionKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - - sprintf(fileName, DADVECTION "/okl/advectionMaxWaveSpeed%s.okl", suffix); - sprintf(kernelName, "advectionMaxWaveSpeed%s", suffix); - - advection->maxWaveSpeedKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - - return *advection; -} + initialConditionKernel = platform.buildKernel(fileName, kernelName, kernelInfo); -advection_t::~advection_t() { - volumeKernel.free(); - surfaceKernel.free(); - 
initialConditionKernel.free(); - maxWaveSpeedKernel.free(); + fileName = oklFilePrefix + "advectionMaxWaveSpeed" + suffix + oklFileSuffix; + kernelName = "advectionMaxWaveSpeed" + suffix; - if (timeStepper) delete timeStepper; - if (traceHalo) traceHalo->Free(); + maxWaveSpeedKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } diff --git a/solvers/advection/src/advectionStep.cpp b/solvers/advection/src/advectionStep.cpp index 9ccbe7a11..99eb5f2a5 100644 --- a/solvers/advection/src/advectionStep.cpp +++ b/solvers/advection/src/advectionStep.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,13 +26,13 @@ SOFTWARE. #include "advection.hpp" -dfloat advection_t::MaxWaveSpeed(occa::memory& o_Q, const dfloat T){ +dfloat advection_t::MaxWaveSpeed(deviceMemory& o_Q, const dfloat T){ //Note: if this is on the critical path in the future, we should pre-allocate this - occa::memory o_maxSpeed = platform.malloc(mesh.Nelements*sizeof(dfloat)); + deviceMemory o_maxSpeed = platform.malloc(mesh.Nelements); maxWaveSpeedKernel(mesh.Nelements, - mesh.o_vgeo, + mesh.o_wJ, mesh.o_sgeo, mesh.o_vmapM, mesh.o_EToB, @@ -43,17 +43,16 @@ dfloat advection_t::MaxWaveSpeed(occa::memory& o_Q, const dfloat T){ o_Q, o_maxSpeed); - const dfloat vmax = platform.linAlg.max(mesh.Nelements, o_maxSpeed, mesh.comm); + const dfloat vmax = platform.linAlg().max(mesh.Nelements, o_maxSpeed, mesh.comm); - o_maxSpeed.free(); return vmax; } //evaluate ODE rhs = f(q,t) -void advection_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){ +void advection_t::rhsf(deviceMemory& o_Q, deviceMemory& o_RHS, const dfloat T){ // extract q halo on DEVICE - traceHalo->ExchangeStart(o_Q, 1, 
ogs_dfloat); + traceHalo.ExchangeStart(o_Q, 1); volumeKernel(mesh.Nelements, mesh.o_vgeo, @@ -65,7 +64,7 @@ void advection_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){ o_Q, o_RHS); - traceHalo->ExchangeFinish(o_Q, 1, ogs_dfloat); + traceHalo.ExchangeFinish(o_Q, 1); surfaceKernel(mesh.Nelements, mesh.o_sgeo, diff --git a/solvers/bns/bns.hpp b/solvers/bns/bns.hpp index d47f3d8e0..125aec38f 100644 --- a/solvers/bns/bns.hpp +++ b/solvers/bns/bns.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -36,33 +36,35 @@ SOFTWARE. #define DBNS LIBP_DIR"/solvers/bns/" +using namespace libp; + class bnsSettings_t: public settings_t { public: - bnsSettings_t(MPI_Comm& _comm); + bnsSettings_t(comm_t& _comm); void report(); void parseFromFile(platformSettings_t& platformSettings, meshSettings_t& meshSettings, - const string filename); + const std::string filename); }; class bns_t: public solver_t { public: - mesh_t& mesh; + mesh_t mesh; int Nfields; int Npmlfields; - TimeStepper::timeStepper_t* timeStepper; + timeStepper_t timeStepper; - halo_t* traceHalo; - halo_t** multirateTraceHalo; + ogs::halo_t traceHalo; + memory multirateTraceHalo; dfloat RT, c, tauInv, Ma, Re, nu; // Flow parameters // Pml int pmlOrder; dfloat sigmaXmax, sigmaYmax, sigmaZmax; - dfloat *pmlSigma; + memory pmlSigma; dfloat pmlAlpha; // Flag for using cubature integration for sigma terms in pml @@ -71,38 +73,37 @@ class bns_t: public solver_t { // Flag for semi-analytic timestepping int semiAnalytic; - dfloat *q; - occa::memory o_q; + memory q; + deviceMemory o_q; - occa::memory o_Mq; + deviceMemory o_Mq; - dfloat *Vort, *VortMag; - occa::memory o_Vort, o_VortMag; + memory Vort, VortMag; + 
deviceMemory o_Vort, o_VortMag; - occa::memory o_pmlSigma; + deviceMemory o_pmlSigma; - occa::kernel volumeKernel; - occa::kernel surfaceKernel; - occa::kernel relaxationKernel; + kernel_t volumeKernel; + kernel_t surfaceKernel; + kernel_t relaxationKernel; - occa::kernel pmlVolumeKernel; - occa::kernel pmlSurfaceKernel; - occa::kernel pmlRelaxationKernel; + kernel_t pmlVolumeKernel; + kernel_t pmlSurfaceKernel; + kernel_t pmlRelaxationKernel; - occa::kernel vorticityKernel; + kernel_t vorticityKernel; - occa::kernel initialConditionKernel; + kernel_t initialConditionKernel; - bns_t() = delete; + bns_t() = default; bns_t(platform_t &_platform, mesh_t &_mesh, - bnsSettings_t& _settings): - solver_t(_platform, _settings), mesh(_mesh) {} - - ~bns_t(); + bnsSettings_t& _settings) { + Setup(_platform, _mesh, _settings); + } //setup - static bns_t& Setup(platform_t& platform, mesh_t& mesh, - bnsSettings_t& settings); + void Setup(platform_t& _platform, mesh_t& _mesh, + bnsSettings_t& _settings); void PmlSetup(); @@ -110,40 +111,40 @@ class bns_t: public solver_t { void Report(dfloat time, int tstep); - void PlotFields(dfloat* Q, dfloat* V, char *fileName); + void PlotFields(memory& Q, memory& V, std::string fileName); dfloat MaxWaveSpeed(); - void rhsf_pml(occa::memory& o_Q, occa::memory& o_pmlQ, - occa::memory& o_RHS, occa::memory& o_pmlRHS, const dfloat T); + void rhsf_pml(deviceMemory& o_Q, deviceMemory& o_pmlQ, + deviceMemory& o_RHS, deviceMemory& o_pmlRHS, const dfloat T); - void rhsf_MR_pml(occa::memory& o_Q, occa::memory& o_pmlQ, - occa::memory& o_RHS, occa::memory& o_pmlRHS, - occa::memory& o_fQM, const dfloat T, const int lev); + void rhsf_MR_pml(deviceMemory& o_Q, deviceMemory& o_pmlQ, + deviceMemory& o_RHS, deviceMemory& o_pmlRHS, + deviceMemory& o_fQM, const dfloat T, const int lev); //seperate components of rhs evaluation - void rhsVolume(dlong N, occa::memory& o_ids, - occa::memory& o_Q, occa::memory& o_RHS, const dfloat T); - void rhsPmlVolume(dlong N, 
occa::memory& o_ids, occa::memory& o_pmlids, - occa::memory& o_Q, occa::memory& o_pmlQ, - occa::memory& o_RHS, occa::memory& o_pmlRHS, const dfloat T); - void rhsRelaxation(dlong N, occa::memory& o_ids, - occa::memory& o_Q, occa::memory& o_RHS); - void rhsPmlRelaxation(dlong N, occa::memory& o_ids, occa::memory& o_pmlids, - occa::memory& o_Q, occa::memory& o_pmlQ, - occa::memory& o_RHS, occa::memory& o_pmlRHS); - void rhsSurface(dlong N, occa::memory& o_ids, - occa::memory& o_Q, occa::memory& o_RHS, const dfloat T); - void rhsPmlSurface(dlong N, occa::memory& o_ids, occa::memory& o_pmlids, - occa::memory& o_Q, occa::memory& o_pmlQ, - occa::memory& o_RHS, occa::memory& o_pmlRHS, const dfloat T); - void rhsSurfaceMR(dlong N, occa::memory& o_ids, - occa::memory& o_Q, occa::memory& o_RHS, - occa::memory& o_fQM, const dfloat T); - void rhsPmlSurfaceMR(dlong N, occa::memory& o_ids, occa::memory& o_pmlids, - occa::memory& o_Q, occa::memory& o_pmlQ, - occa::memory& o_RHS, occa::memory& o_pmlRHS, - occa::memory& o_fQM, const dfloat T); + void rhsVolume(dlong N, deviceMemory& o_ids, + deviceMemory& o_Q, deviceMemory& o_RHS, const dfloat T); + void rhsPmlVolume(dlong N, deviceMemory& o_ids, deviceMemory& o_pmlids, + deviceMemory& o_Q, deviceMemory& o_pmlQ, + deviceMemory& o_RHS, deviceMemory& o_pmlRHS, const dfloat T); + void rhsRelaxation(dlong N, deviceMemory& o_ids, + deviceMemory& o_Q, deviceMemory& o_RHS); + void rhsPmlRelaxation(dlong N, deviceMemory& o_ids, deviceMemory& o_pmlids, + deviceMemory& o_Q, deviceMemory& o_pmlQ, + deviceMemory& o_RHS, deviceMemory& o_pmlRHS); + void rhsSurface(dlong N, deviceMemory& o_ids, + deviceMemory& o_Q, deviceMemory& o_RHS, const dfloat T); + void rhsPmlSurface(dlong N, deviceMemory& o_ids, deviceMemory& o_pmlids, + deviceMemory& o_Q, deviceMemory& o_pmlQ, + deviceMemory& o_RHS, deviceMemory& o_pmlRHS, const dfloat T); + void rhsSurfaceMR(dlong N, deviceMemory& o_ids, + deviceMemory& o_Q, deviceMemory& o_RHS, + deviceMemory& o_fQM, 
const dfloat T); + void rhsPmlSurfaceMR(dlong N, deviceMemory& o_ids, deviceMemory& o_pmlids, + deviceMemory& o_Q, deviceMemory& o_pmlQ, + deviceMemory& o_RHS, deviceMemory& o_pmlRHS, + deviceMemory& o_fQM, const dfloat T); }; #endif @@ -154,7 +155,7 @@ class bns_t: public solver_t { dfloat isoMinVal, isoMaxVal, *isoLevels, *isoq; size_t isoMax; - occa::memory o_isoLevels, o_isoq, o_isoNtris; + deviceMemory o_isoLevels, o_isoq, o_isoNtris; // MRSAAB Coefficients dfloat *MRSAAB_A, *MRSAAB_B, *MRSAAB_C, *MRAB_A, *MRAB_B, *MRAB_C; @@ -164,7 +165,7 @@ class bns_t: public solver_t { int *isoGNlevels, isoGNgroups; dfloat **isoGLvalues; - occa::memory *o_isoGLvalues; + deviceMemory *o_isoGLvalues; // NBN: add storage for compacted isosurf data for gmsh write std::vector iso_nodes; diff --git a/solvers/bns/bnsMain.cpp b/solvers/bns/bnsMain.cpp index e2a16a0a7..e7e0e8c98 100644 --- a/solvers/bns/bnsMain.cpp +++ b/solvers/bns/bnsMain.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -29,39 +29,40 @@ SOFTWARE. 
int main(int argc, char **argv){ // start up MPI - MPI_Init(&argc, &argv); + Comm::Init(argc, argv); - MPI_Comm comm = MPI_COMM_WORLD; + LIBP_ABORT("Usage: ./bnsMain setupfile", argc!=2); - if(argc!=2) - LIBP_ABORT(string("Usage: ./bnsMain setupfile")); + { /*Scope so everything is destructed before MPI_Finalize */ + comm_t comm(Comm::World().Dup()); - //create default settings - platformSettings_t platformSettings(comm); - meshSettings_t meshSettings(comm); - bnsSettings_t bnsSettings(comm); + //create default settings + platformSettings_t platformSettings(comm); + meshSettings_t meshSettings(comm); + bnsSettings_t bnsSettings(comm); - //load settings from file - bnsSettings.parseFromFile(platformSettings, meshSettings, - argv[1]); + //load settings from file + bnsSettings.parseFromFile(platformSettings, meshSettings, + argv[1]); - // set up platform - platform_t platform(platformSettings); + // set up platform + platform_t platform(platformSettings); - platformSettings.report(); - meshSettings.report(); - bnsSettings.report(); + platformSettings.report(); + meshSettings.report(); + bnsSettings.report(); - // set up mesh - mesh_t& mesh = mesh_t::Setup(platform, meshSettings, comm); + // set up mesh + mesh_t mesh(platform, meshSettings, comm); - // set up bns solver - bns_t& bns = bns_t::Setup(platform, mesh, bnsSettings); + // set up bns solver + bns_t bns(platform, mesh, bnsSettings); - // run - bns.Run(); + // run + bns.Run(); + } // close down MPI - MPI_Finalize(); + Comm::Finalize(); return LIBP_SUCCESS; } diff --git a/solvers/bns/data/bnsGaussian2D.h b/solvers/bns/data/bnsGaussian2D.h index 70481a23d..0236e56d2 100644 --- a/solvers/bns/data/bnsGaussian2D.h +++ b/solvers/bns/data/bnsGaussian2D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy 
of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/bns/data/bnsGaussian3D.h b/solvers/bns/data/bnsGaussian3D.h index 845468795..3dd35cdca 100644 --- a/solvers/bns/data/bnsGaussian3D.h +++ b/solvers/bns/data/bnsGaussian3D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/bns/data/bnsUniform2D.h b/solvers/bns/data/bnsUniform2D.h index 22e554f25..7e72719ce 100644 --- a/solvers/bns/data/bnsUniform2D.h +++ b/solvers/bns/data/bnsUniform2D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/bns/data/bnsUniform3D.h b/solvers/bns/data/bnsUniform3D.h index 416edc37f..3c5d8fb17 100644 --- a/solvers/bns/data/bnsUniform3D.h +++ b/solvers/bns/data/bnsUniform3D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/bns/makefile b/solvers/bns/makefile index d0758aa94..1d393db9e 100644 --- a/solvers/bns/makefile +++ b/solvers/bns/makefile @@ -2,7 +2,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel 
Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal @@ -77,11 +77,8 @@ include ../../make.top endif endif -#gslib -GS_DIR=${LIBP_TPL_DIR}/gslib - #libraries -BNS_LIBP_LIBS=timeStepper mesh ogs linAlg core +BNS_LIBP_LIBS=timeStepper mesh parAdogs ogs linAlg core #includes INCLUDES=${LIBP_INCLUDES} \ @@ -91,11 +88,10 @@ DEFINES =${LIBP_DEFINES} \ -DLIBP_DIR='"${LIBP_DIR}"' #.cpp compilation flags -BNS_CXXFLAGS=${LIBP_MPICXXFLAGS} ${DEFINES} ${INCLUDES} +BNS_CXXFLAGS=${LIBP_CXXFLAGS} ${DEFINES} ${INCLUDES} #link libraries LIBS=-L${LIBP_LIBS_DIR} $(addprefix -l,$(BNS_LIBP_LIBS)) \ - -L$(GS_DIR)/lib -lgs \ ${LIBP_LIBS} #link flags @@ -143,10 +139,10 @@ endif # rule for .cpp files %.o: %.cpp $(DEPS) | libp_libs ifneq (,${verbose}) - $(LIBP_MPICXX) -o $*.o -c $*.cpp $(BNS_CXXFLAGS) + $(LIBP_CXX) -o $*.o -c $*.cpp $(BNS_CXXFLAGS) else @printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n"; - @$(LIBP_MPICXX) -o $*.o -c $*.cpp $(BNS_CXXFLAGS) + @$(LIBP_CXX) -o $*.o -c $*.cpp $(BNS_CXXFLAGS) endif #cleanup @@ -157,8 +153,7 @@ clean-libs: clean ${MAKE} -C ${LIBP_LIBS_DIR} clean clean-kernels: clean-libs -# $(shell ${OCCA_DIR}/bin/occa clear all -y) - rm -rf ~/.occa/ + rm -rf ${LIBP_DIR}/.occa/ realclean: clean ${MAKE} -C ${LIBP_LIBS_DIR} realclean diff --git a/solvers/bns/okl/bnsConstrainQuad3D.okl b/solvers/bns/okl/bnsConstrainQuad3D.okl index 2f46eb8e4..69d26598b 100644 --- a/solvers/bns/okl/bnsConstrainQuad3D.okl +++ b/solvers/bns/okl/bnsConstrainQuad3D.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,34 +26,34 @@ SOFTWARE. 
@kernel void bnsConstrainQuad3D(const dlong Nelements, - @restrict const dfloat * x, - @restrict const dfloat * y, - @restrict const dfloat * z, - @restrict dfloat * rhsq){ + @restrict const dfloat * x, + @restrict const dfloat * y, + @restrict const dfloat * z, + @restrict dfloat * rhsq){ for(dlong e=0;e& Q, memory& V, std::string fileName){ FILE *fp; - fp = fopen(fileName, "w"); + fp = fopen(fileName.c_str(), "w"); fprintf(fp, "\n"); fprintf(fp, " \n"); @@ -44,39 +44,45 @@ void bns_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){ fprintf(fp, " \n"); //scratch space for interpolation - size_t NscratchBytes = mymax(mesh.Np, mesh.plotNp)*sizeof(dfloat); - dfloat* scratch = (dfloat *) malloc(2*NscratchBytes); + size_t Nscratch = std::max(mesh.Np, mesh.plotNp); + memory scratch(2*Nscratch); - dfloat* Ix = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iy = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iz = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); + memory Ix(mesh.plotNp); + memory Iy(mesh.plotNp); + memory Iz(mesh.plotNp); // compute plot node coordinates on the fly for(dlong e=0;e\n"); fprintf(fp, " \n"); - free(Ix); free(Iy); free(Iz); - - dfloat* u = (dfloat *) malloc(mesh.Np*sizeof(dfloat)); - dfloat* v = (dfloat *) malloc(mesh.Np*sizeof(dfloat)); - dfloat* w = (dfloat *) malloc(mesh.Np*sizeof(dfloat)); + memory u(mesh.Np); + memory v(mesh.Np); + memory w(mesh.Np); - dfloat* Ip = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iu = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iv = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iw = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); + memory Ip(mesh.plotNp); + memory Iu(mesh.plotNp); + memory Iv(mesh.plotNp); + memory Iw(mesh.plotNp); - if (Q!=NULL) { + if (Q.length()!=0) { // write out density fprintf(fp, " \n"); fprintf(fp, " \n"); @@ -130,7 +136,7 @@ void bns_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){ fprintf(fp, " \n"); } - if (V!=NULL) { + if 
(V.length()!=0) { // write out vorticity if(mesh.dim==2){ fprintf(fp, " \n"); @@ -160,9 +166,6 @@ void bns_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){ } fprintf(fp, " \n"); - free(u); free(v); free(w); - free(Ip); free(Iu); free(Iv); free(Iw); - fprintf(fp, " \n"); fprintf(fp, " \n"); @@ -203,6 +206,4 @@ void bns_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){ fprintf(fp, " \n"); fprintf(fp, "\n"); fclose(fp); - - free(scratch); } diff --git a/solvers/bns/src/bnsPmlSetup.cpp b/solvers/bns/src/bnsPmlSetup.cpp index 38685c0f9..3d3f8197e 100644 --- a/solvers/bns/src/bnsPmlSetup.cpp +++ b/solvers/bns/src/bnsPmlSetup.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -108,7 +108,7 @@ void bns_t::PmlSetup(){ int pmlNp = (pmlcubature) ? mesh.cubNp : mesh.Np; int pmlNq = (pmlcubature) ? 
mesh.cubNq : mesh.Nq; - dfloat *pmlr, *pmls, *pmlt; + memory pmlr, pmls, pmlt; if(pmlcubature){ pmlr = mesh.cubr; pmls = mesh.cubs; @@ -121,27 +121,27 @@ void bns_t::PmlSetup(){ // printf("Setting PML Coefficient \n"); //set up damping parameter - pmlSigma = (dfloat *) calloc(mesh.dim*mesh.NpmlElements*pmlNp,sizeof(dfloat)); + pmlSigma.malloc(mesh.dim*mesh.NpmlElements*pmlNp, 0.0); for (dlong m=0;m xe = mesh.EX + e*mesh.Nverts; + memory ye = mesh.EY + e*mesh.Nverts; + memory ze = mesh.EZ + e*mesh.Nverts; for(int n=0;n(pmlSigma); } } diff --git a/solvers/bns/src/bnsReport.cpp b/solvers/bns/src/bnsReport.cpp index 6dae7648c..68e17099b 100644 --- a/solvers/bns/src/bnsReport.cpp +++ b/solvers/bns/src/bnsReport.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -37,7 +37,7 @@ void bns_t::Report(dfloat time, int tstep){ mesh.MassMatrixApply(o_q, o_Mq); dlong Nentries = mesh.Nelements*mesh.Np*Nfields; - dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm)); + dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm)); if(mesh.rank==0) printf("%5.2f (%d), %5.2f (time, timestep, norm)\n", time, tstep, norm2); @@ -49,12 +49,12 @@ void bns_t::Report(dfloat time, int tstep){ o_Vort.copyTo(Vort); // output field files - string name; + std::string name; settings.getSetting("OUTPUT FILE NAME", name); char fname[BUFSIZ]; sprintf(fname, "%s_%04d_%04d.vtu", name.c_str(), mesh.rank, frame++); - PlotFields(q, Vort, fname); + PlotFields(q, Vort, std::string(fname)); } /* diff --git a/solvers/bns/src/bnsRun.cpp b/solvers/bns/src/bnsRun.cpp index 84ae062e6..342f520dd 100644 --- a/solvers/bns/src/bnsRun.cpp +++ b/solvers/bns/src/bnsRun.cpp @@ 
-2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -51,7 +51,7 @@ void bns_t::Run(){ dfloat dtAdv = hmin/(vmax*(mesh.N+1.)*(mesh.N+1.)); dfloat dtVisc = 1.0/tauInv; - dfloat dt = (semiAnalytic) ? cfl*dtAdv : cfl*mymin(dtAdv, dtVisc); + dfloat dt = (semiAnalytic) ? cfl*dtAdv : cfl*std::min(dtAdv, dtVisc); /* Artificial warping of time step size for multirate testing */ @@ -60,9 +60,9 @@ void bns_t::Run(){ settings.compareSetting("TIME INTEGRATOR","MRSAAB3")) dt /= (1<<(mesh.mrNlevels-1)); #endif - timeStepper->SetTimeStep(dt); + timeStepper.SetTimeStep(dt); - timeStepper->Run(o_q, startTime, finalTime); + timeStepper.Run(*this, o_q, startTime, finalTime); // output norm of final solution { @@ -70,7 +70,7 @@ void bns_t::Run(){ mesh.MassMatrixApply(o_q, o_Mq); dlong Nentries = mesh.Nelements*mesh.Np*Nfields; - dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm)); + dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm)); if(mesh.rank==0) printf("Solution norm = %17.15lg\n", norm2); diff --git a/solvers/bns/src/bnsSettings.cpp b/solvers/bns/src/bnsSettings.cpp index 6fc2cd10f..cff7fe8e8 100644 --- a/solvers/bns/src/bnsSettings.cpp +++ b/solvers/bns/src/bnsSettings.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,7 +27,7 @@ SOFTWARE. 
#include "bns.hpp" //settings for bns solver -bnsSettings_t::bnsSettings_t(MPI_Comm& _comm): +bnsSettings_t::bnsSettings_t(comm_t& _comm): settings_t(_comm) { newSetting("DATA FILE", @@ -95,10 +95,7 @@ bnsSettings_t::bnsSettings_t(MPI_Comm& _comm): void bnsSettings_t::report() { - int rank; - MPI_Comm_rank(comm, &rank); - - if (rank==0) { + if (comm.rank()==0) { std::cout << "BNS Settings:\n\n"; reportSetting("DATA FILE"); reportSetting("SPEED OF SOUND"); @@ -119,15 +116,15 @@ void bnsSettings_t::report() { void bnsSettings_t::parseFromFile(platformSettings_t& platformSettings, meshSettings_t& meshSettings, - const string filename) { + const std::string filename) { //read all settings from file settings_t s(comm); s.readSettingsFromFile(filename); for(auto it = s.settings.begin(); it != s.settings.end(); ++it) { - setting_t* set = it->second; - const string name = set->getName(); - const string val = set->getVal(); + setting_t& set = it->second; + const std::string name = set.getName(); + const std::string val = set.getVal(); if (platformSettings.hasSetting(name)) platformSettings.changeSetting(name, val); else if (meshSettings.hasSetting(name)) @@ -135,9 +132,7 @@ void bnsSettings_t::parseFromFile(platformSettings_t& platformSettings, else if (hasSetting(name)) //self changeSetting(name, val); else { - stringstream ss; - ss << "Unknown setting: [" << name << "] requested"; - LIBP_ABORT(ss.str()); + LIBP_FORCE_ABORT("Unknown setting: [" << name << "] requested"); } } -} \ No newline at end of file +} diff --git a/solvers/bns/src/bnsSetup.cpp b/solvers/bns/src/bnsSetup.cpp index 8831c5e32..842c54104 100644 --- a/solvers/bns/src/bnsSetup.cpp +++ b/solvers/bns/src/bnsSetup.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 
associated documentation files (the "Software"), to deal @@ -26,54 +26,60 @@ SOFTWARE. #include "bns.hpp" -bns_t& bns_t::Setup(platform_t& platform, mesh_t& mesh, - bnsSettings_t& settings){ +void bns_t::Setup(platform_t& _platform, mesh_t& _mesh, + bnsSettings_t& _settings){ - bns_t* bns = new bns_t(platform, mesh, settings); + platform = _platform; + mesh = _mesh; + comm = _mesh.comm; + settings = _settings; //get physical paramters - settings.getSetting("SPEED OF SOUND", bns->c); - settings.getSetting("VISCOSITY", bns->nu); - bns->RT = bns->c*bns->c; - bns->tauInv = bns->RT/bns->nu; + settings.getSetting("SPEED OF SOUND", c); + settings.getSetting("VISCOSITY", nu); + RT = c*c; + tauInv = RT/nu; - bns->Nfields = (mesh.dim==3) ? 10:6; - bns->Npmlfields = mesh.dim*bns->Nfields; + Nfields = (mesh.dim==3) ? 10:6; + Npmlfields = mesh.dim*Nfields; + + //Trigger JIT kernel builds + ogs::InitializeKernels(platform, ogs::Dfloat, ogs::Add); //setup cubature mesh.CubatureSetup(); //Setup PML - bns->PmlSetup(); + PmlSetup(); //setup timeStepper - dlong Nlocal = mesh.Nelements*mesh.Np*bns->Nfields; - dlong Nhalo = mesh.totalHaloPairs*mesh.Np*bns->Nfields; + dlong Nlocal = mesh.Nelements*mesh.Np*Nfields; + dlong Nhalo = mesh.totalHaloPairs*mesh.Np*Nfields; - bns->semiAnalytic = 0; + semiAnalytic = 0; if (settings.compareSetting("TIME INTEGRATOR","SARK4") ||settings.compareSetting("TIME INTEGRATOR","SARK5") ||settings.compareSetting("TIME INTEGRATOR","SAAB3") ||settings.compareSetting("TIME INTEGRATOR","MRSAAB3")) - bns->semiAnalytic = 1; + semiAnalytic = 1; //semi-analytic exponential coefficients - dfloat lambda[bns->Nfields]; + memory lambda(Nfields); for (int i=0;iNfields;i++) lambda[i] = -bns->tauInv; + for (int i=mesh.dim+1;iMaxWaveSpeed(); + memory EtoDT(mesh.Nelements); + dfloat vmax = MaxWaveSpeed(); for(dlong e=0;etauInv; + dfloat dtVisc = 1.0/tauInv; - if (bns->semiAnalytic) + if (semiAnalytic) EtoDT[e] = dtAdv; else - EtoDT[e] = mymin(dtAdv, dtVisc); + EtoDT[e] = 
std::min(dtAdv, dtVisc); /* Artificial warping of time step size for multirate testing @@ -94,7 +100,7 @@ bns_t& bns_t::Setup(platform_t& platform, mesh_t& mesh, if (mesh.dim==3) c = mymin(c,fabs(z)); - c = mymax(0.5, c); + c = std::max(0.5, c); EtoDT[e] *= c; #endif } @@ -104,183 +110,179 @@ bns_t& bns_t::Setup(platform_t& platform, mesh_t& mesh, settings.compareSetting("TIME INTEGRATOR","MRSAAB3")) { mesh.MultiRateSetup(EtoDT); mesh.MultiRatePmlSetup(); - bns->multirateTraceHalo = mesh.MultiRateHaloTraceSetup(bns->Nfields); + multirateTraceHalo = mesh.MultiRateHaloTraceSetup(Nfields); } if (settings.compareSetting("TIME INTEGRATOR","MRAB3")){ - bns->timeStepper = new TimeStepper::mrab3_pml(mesh.Nelements, mesh.NpmlElements, mesh.totalHaloPairs, - mesh.Np, bns->Nfields, bns->Npmlfields, *bns, mesh); + timeStepper.Setup(mesh.Nelements, mesh.NpmlElements, + mesh.totalHaloPairs, + mesh.Np, Nfields, Npmlfields, + platform, mesh); } else if (settings.compareSetting("TIME INTEGRATOR","MRSAAB3")){ - bns->timeStepper = new TimeStepper::mrsaab3_pml(mesh.Nelements, mesh.NpmlElements, mesh.totalHaloPairs, - mesh.Np, bns->Nfields, bns->Npmlfields, lambda, *bns, mesh); + timeStepper.Setup(mesh.Nelements, mesh.NpmlElements, + mesh.totalHaloPairs, + mesh.Np, Nfields, Npmlfields, + lambda, platform, mesh); } else if (settings.compareSetting("TIME INTEGRATOR","SAAB3")) { - bns->timeStepper = new TimeStepper::saab3_pml(mesh.Nelements, mesh.NpmlElements, mesh.totalHaloPairs, - mesh.Np, bns->Nfields, bns->Npmlfields, lambda, *bns); + timeStepper.Setup(mesh.Nelements, mesh.NpmlElements, + mesh.totalHaloPairs, + mesh.Np, Nfields, Npmlfields, + lambda, platform, comm); } else if (settings.compareSetting("TIME INTEGRATOR","AB3")){ - bns->timeStepper = new TimeStepper::ab3_pml(mesh.Nelements, mesh.NpmlElements, mesh.totalHaloPairs, - mesh.Np, bns->Nfields, bns->Npmlfields, *bns); + timeStepper.Setup(mesh.Nelements, mesh.NpmlElements, + mesh.totalHaloPairs, + mesh.Np, Nfields, Npmlfields, 
+ platform, comm); } else if (settings.compareSetting("TIME INTEGRATOR","LSERK4")){ - bns->timeStepper = new TimeStepper::lserk4_pml(mesh.Nelements, mesh.NpmlElements, mesh.totalHaloPairs, - mesh.Np, bns->Nfields, bns->Npmlfields, *bns); + timeStepper.Setup(mesh.Nelements, mesh.NpmlElements, + mesh.totalHaloPairs, + mesh.Np, Nfields, Npmlfields, + platform, comm); } else if (settings.compareSetting("TIME INTEGRATOR","DOPRI5")){ - bns->timeStepper = new TimeStepper::dopri5_pml(mesh.Nelements, mesh.NpmlElements, mesh.totalHaloPairs, - mesh.Np, bns->Nfields, bns->Npmlfields, *bns, mesh.comm); + timeStepper.Setup(mesh.Nelements, mesh.NpmlElements, + mesh.totalHaloPairs, + mesh.Np, Nfields, Npmlfields, + platform, comm); } else if (settings.compareSetting("TIME INTEGRATOR","SARK4")) { - bns->timeStepper = new TimeStepper::sark4_pml(mesh.Nelements, mesh.NpmlElements, mesh.totalHaloPairs, - mesh.Np, bns->Nfields, bns->Npmlfields, lambda, *bns, mesh.comm); + timeStepper.Setup(mesh.Nelements, mesh.NpmlElements, + mesh.totalHaloPairs, + mesh.Np, Nfields, Npmlfields, + lambda, platform, comm); } else if (settings.compareSetting("TIME INTEGRATOR","SARK5")) { - bns->timeStepper = new TimeStepper::sark5_pml(mesh.Nelements, mesh.NpmlElements, mesh.totalHaloPairs, - mesh.Np, bns->Nfields, bns->Npmlfields, lambda, *bns, mesh.comm); + timeStepper.Setup(mesh.Nelements, mesh.NpmlElements, + mesh.totalHaloPairs, + mesh.Np, Nfields, Npmlfields, + lambda, platform, comm); } else { - LIBP_ABORT(string("Requested TIME INTEGRATOR not found.")); + LIBP_FORCE_ABORT("Requested TIME INTEGRATOR not found."); } - free(EtoDT); //setup linear algebra module - platform.linAlg.InitKernels({"innerProd"}); + platform.linAlg().InitKernels({"innerProd"}); /*setup trace halo exchange */ - bns->traceHalo = mesh.HaloTraceSetup(bns->Nfields); + traceHalo = mesh.HaloTraceSetup(Nfields); // compute samples of q at interpolation nodes - bns->q = (dfloat*) calloc(Nlocal+Nhalo, sizeof(dfloat)); - bns->o_q = 
platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), bns->q); + q.malloc(Nlocal+Nhalo, 0.0); + o_q = platform.malloc(q); - bns->Vort = (dfloat*) calloc(mesh.dim*mesh.Nelements*mesh.Np, sizeof(dfloat)); - bns->o_Vort = platform.malloc((mesh.dim*mesh.Nelements*mesh.Np)*sizeof(dfloat), - bns->Vort); + Vort.malloc(mesh.dim*mesh.Nelements*mesh.Np, 0.0); + o_Vort = platform.malloc(Vort); //storage for M*q during reporting - bns->o_Mq = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), bns->q); - mesh.MassMatrixKernelSetup(bns->Nfields); // mass matrix operator + o_Mq = platform.malloc(q); + mesh.MassMatrixKernelSetup(Nfields); // mass matrix operator // OCCA build stuff - occa::properties kernelInfo = mesh.props; //copy base occa properties + properties_t kernelInfo = mesh.props; //copy base occa properties //add boundary data to kernel info - string dataFileName; + std::string dataFileName; settings.getSetting("DATA FILE", dataFileName); kernelInfo["includes"] += dataFileName; - kernelInfo["defines/" "p_Nfields"]= bns->Nfields; - kernelInfo["defines/" "p_Npmlfields"]= bns->Npmlfields; + kernelInfo["defines/" "p_Nfields"]= Nfields; + kernelInfo["defines/" "p_Npmlfields"]= Npmlfields; - int maxNodes = mymax(mesh.Np, (mesh.Nfp*mesh.Nfaces)); + int maxNodes = std::max(mesh.Np, (mesh.Nfp*mesh.Nfaces)); kernelInfo["defines/" "p_maxNodes"]= maxNodes; int blockMax = 256; if (platform.device.mode()=="CUDA") blockMax = 512; - int NblockV = mymax(1, blockMax/mesh.Np); + int NblockV = std::max(1, blockMax/mesh.Np); kernelInfo["defines/" "p_NblockV"]= NblockV; - int NblockS = mymax(1, blockMax/maxNodes); + int NblockS = std::max(1, blockMax/maxNodes); kernelInfo["defines/" "p_NblockS"]= NblockS; - int NblockCub = mymax(1, blockMax/mesh.cubNp); + int NblockCub = std::max(1, blockMax/mesh.cubNp); kernelInfo["defines/" "p_NblockCub"]= NblockCub; - kernelInfo["parser/" "automate-add-barriers"] = "disabled"; - // set kernel name suffix - char *suffix; - if(mesh.elementType==TRIANGLES) - suffix = 
strdup("Tri2D"); - if(mesh.elementType==QUADRILATERALS) - suffix = strdup("Quad2D"); - if(mesh.elementType==TETRAHEDRA) - suffix = strdup("Tet3D"); - if(mesh.elementType==HEXAHEDRA) - suffix = strdup("Hex3D"); - - char fileName[BUFSIZ], kernelName[BUFSIZ]; + std::string suffix; + if(mesh.elementType==Mesh::TRIANGLES) + suffix = "Tri2D"; + if(mesh.elementType==Mesh::QUADRILATERALS) + suffix = "Quad2D"; + if(mesh.elementType==Mesh::TETRAHEDRA) + suffix = "Tet3D"; + if(mesh.elementType==Mesh::HEXAHEDRA) + suffix = "Hex3D"; + + std::string oklFilePrefix = DBNS "/okl/"; + std::string oklFileSuffix = ".okl"; + + std::string fileName, kernelName; // kernels from volume file - sprintf(fileName, DBNS "/okl/bnsVolume%s.okl", suffix); - sprintf(kernelName, "bnsVolume%s", suffix); - bns->volumeKernel = platform.buildKernel(fileName, kernelName, + fileName = oklFilePrefix + "bnsVolume" + suffix + oklFileSuffix; + kernelName = "bnsVolume" + suffix; + volumeKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - if (bns->pmlcubature) { - sprintf(kernelName, "bnsPmlVolumeCub%s", suffix); - bns->pmlVolumeKernel = platform.buildKernel(fileName, kernelName, + if (pmlcubature) { + kernelName = "bnsPmlVolumeCub" + suffix; + pmlVolumeKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } else { - sprintf(kernelName, "bnsPmlVolume%s", suffix); - bns->pmlVolumeKernel = platform.buildKernel(fileName, kernelName, + kernelName = "bnsPmlVolume" + suffix; + pmlVolumeKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } // kernels from relaxation file - sprintf(fileName, DBNS "/okl/bnsRelaxation%s.okl", suffix); - sprintf(kernelName, "bnsRelaxation%s", suffix); - bns->relaxationKernel = platform.buildKernel(fileName, kernelName, + fileName = oklFilePrefix + "bnsRelaxation" + suffix + oklFileSuffix; + kernelName = "bnsRelaxation" + suffix; + relaxationKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - if (bns->pmlcubature) { - 
sprintf(kernelName, "bnsPmlRelaxationCub%s", suffix); - bns->pmlRelaxationKernel = platform.buildKernel(fileName, kernelName, + if (pmlcubature) { + kernelName = "bnsPmlRelaxationCub" + suffix; + pmlRelaxationKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } else { - bns->pmlRelaxationKernel = platform.buildKernel(fileName, kernelName, + pmlRelaxationKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } // kernels from surface file - sprintf(fileName, DBNS "/okl/bnsSurface%s.okl", suffix); + fileName = oklFilePrefix + "bnsSurface" + suffix + oklFileSuffix; if (settings.compareSetting("TIME INTEGRATOR","MRAB3") || settings.compareSetting("TIME INTEGRATOR","MRSAAB3")) { - sprintf(kernelName, "bnsMRSurface%s", suffix); - bns->surfaceKernel = platform.buildKernel(fileName, kernelName, + kernelName = "bnsMRSurface" + suffix; + surfaceKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(kernelName, "bnsMRPmlSurface%s", suffix); - bns->pmlSurfaceKernel = platform.buildKernel(fileName, kernelName, + kernelName = "bnsMRPmlSurface" + suffix; + pmlSurfaceKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } else { - sprintf(kernelName, "bnsSurface%s", suffix); - bns->surfaceKernel = platform.buildKernel(fileName, kernelName, + kernelName = "bnsSurface" + suffix; + surfaceKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(kernelName, "bnsPmlSurface%s", suffix); - bns->pmlSurfaceKernel = platform.buildKernel(fileName, kernelName, + kernelName = "bnsPmlSurface" + suffix; + pmlSurfaceKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } // vorticity calculation - sprintf(fileName, DBNS "/okl/bnsVorticity%s.okl", suffix); - sprintf(kernelName, "bnsVorticity%s", suffix); + fileName = oklFilePrefix + "bnsVorticity" + suffix + oklFileSuffix; + kernelName = "bnsVorticity" + suffix; - bns->vorticityKernel = platform.buildKernel(fileName, kernelName, + vorticityKernel = 
platform.buildKernel(fileName, kernelName, kernelInfo); if (mesh.dim==2) { - sprintf(fileName, DBNS "/okl/bnsInitialCondition2D.okl"); - sprintf(kernelName, "bnsInitialCondition2D"); + fileName = oklFilePrefix + "bnsInitialCondition2D" + oklFileSuffix; + kernelName = "bnsInitialCondition2D"; } else { - sprintf(fileName, DBNS "/okl/bnsInitialCondition3D.okl"); - sprintf(kernelName, "bnsInitialCondition3D"); + fileName = oklFilePrefix + "bnsInitialCondition3D" + oklFileSuffix; + kernelName = "bnsInitialCondition3D"; } - bns->initialConditionKernel = platform.buildKernel(fileName, kernelName, + initialConditionKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - - return *bns; } - -bns_t::~bns_t() { - volumeKernel.free(); - surfaceKernel.free(); - relaxationKernel.free(); - pmlVolumeKernel.free(); - pmlSurfaceKernel.free(); - pmlRelaxationKernel.free(); - vorticityKernel.free(); - initialConditionKernel.free(); - - if (timeStepper) delete timeStepper; - if (traceHalo) traceHalo->Free(); - - for (int lev=0;levFree(); -} \ No newline at end of file diff --git a/solvers/bns/src/bnsStep.cpp b/solvers/bns/src/bnsStep.cpp index 40a6fa611..cfd9eb34c 100644 --- a/solvers/bns/src/bnsStep.cpp +++ b/solvers/bns/src/bnsStep.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -32,11 +32,11 @@ dfloat bns_t::MaxWaveSpeed(){ } //evaluate ODE rhs = f(q,t) -void bns_t::rhsf_pml(occa::memory& o_Q, occa::memory& o_pmlQ, - occa::memory& o_RHS, occa::memory& o_pmlRHS, const dfloat T){ +void bns_t::rhsf_pml(deviceMemory& o_Q, deviceMemory& o_pmlQ, + deviceMemory& o_RHS, deviceMemory& o_pmlRHS, const dfloat T){ // extract q trace halo and start exchange - 
traceHalo->ExchangeStart(o_Q, 1, ogs_dfloat); + traceHalo.ExchangeStart(o_Q, 1); // compute volume contribution to bns RHS rhsVolume(mesh.NnonPmlElements, mesh.o_nonPmlElements, o_Q, o_RHS, T); @@ -49,7 +49,7 @@ void bns_t::rhsf_pml(occa::memory& o_Q, occa::memory& o_pmlQ, o_Q, o_pmlQ, o_RHS, o_pmlRHS); // complete trace halo exchange - traceHalo->ExchangeFinish(o_Q, 1, ogs_dfloat); + traceHalo.ExchangeFinish(o_Q, 1); // compute surface contribution to bns RHS rhsSurface(mesh.NnonPmlElements, mesh.o_nonPmlElements, o_Q, o_RHS, T); @@ -59,12 +59,12 @@ void bns_t::rhsf_pml(occa::memory& o_Q, occa::memory& o_pmlQ, //evaluate ODE rhs = f(q,t) -void bns_t::rhsf_MR_pml(occa::memory& o_Q, occa::memory& o_pmlQ, - occa::memory& o_RHS, occa::memory& o_pmlRHS, - occa::memory& o_fQM, const dfloat T, const int lev){ +void bns_t::rhsf_MR_pml(deviceMemory& o_Q, deviceMemory& o_pmlQ, + deviceMemory& o_RHS, deviceMemory& o_pmlRHS, + deviceMemory& o_fQM, const dfloat T, const int lev){ // extract q trace halo and start exchange - multirateTraceHalo[lev]->ExchangeStart(o_fQM, 1, ogs_dfloat); + multirateTraceHalo[lev].ExchangeStart(o_fQM, 1); // compute volume contribution to bns RHS rhsVolume(mesh.mrNnonPmlElements[lev], mesh.o_mrNonPmlElements[lev], o_Q, o_RHS, T); @@ -77,7 +77,7 @@ void bns_t::rhsf_MR_pml(occa::memory& o_Q, occa::memory& o_pmlQ, o_Q, o_pmlQ, o_RHS, o_pmlRHS); // complete trace halo exchange - multirateTraceHalo[lev]->ExchangeFinish(o_fQM, 1, ogs_dfloat); + multirateTraceHalo[lev].ExchangeFinish(o_fQM, 1); // compute surface contribution to bns RHS rhsSurfaceMR(mesh.mrNnonPmlElements[lev], mesh.o_mrNonPmlElements[lev], o_Q, o_RHS, o_fQM, T); @@ -85,8 +85,8 @@ void bns_t::rhsf_MR_pml(occa::memory& o_Q, occa::memory& o_pmlQ, o_Q, o_pmlQ, o_RHS, o_pmlRHS, o_fQM, T); } -void bns_t::rhsVolume(dlong N, occa::memory& o_ids, - occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){ +void bns_t::rhsVolume(dlong N, deviceMemory& o_ids, + deviceMemory& o_Q, deviceMemory& 
o_RHS, const dfloat T){ // compute volume contribution to bns RHS if (N) @@ -104,9 +104,9 @@ void bns_t::rhsVolume(dlong N, occa::memory& o_ids, o_RHS); } -void bns_t::rhsPmlVolume(dlong N, occa::memory& o_ids, occa::memory& o_pmlids, - occa::memory& o_Q, occa::memory& o_pmlQ, - occa::memory& o_RHS, occa::memory& o_pmlRHS, const dfloat T){ +void bns_t::rhsPmlVolume(dlong N, deviceMemory& o_ids, deviceMemory& o_pmlids, + deviceMemory& o_Q, deviceMemory& o_pmlQ, + deviceMemory& o_RHS, deviceMemory& o_pmlRHS, const dfloat T){ // compute volume contribution to bns RHS if (N) { @@ -146,8 +146,8 @@ void bns_t::rhsPmlVolume(dlong N, occa::memory& o_ids, occa::memory& o_pmlids, } } -void bns_t::rhsRelaxation(dlong N, occa::memory& o_ids, - occa::memory& o_Q, occa::memory& o_RHS){ +void bns_t::rhsRelaxation(dlong N, deviceMemory& o_ids, + deviceMemory& o_Q, deviceMemory& o_RHS){ // compute volume contribution to bns RHS if (N) @@ -163,9 +163,9 @@ void bns_t::rhsRelaxation(dlong N, occa::memory& o_ids, o_RHS); } -void bns_t::rhsPmlRelaxation(dlong N, occa::memory& o_ids, occa::memory& o_pmlids, - occa::memory& o_Q, occa::memory& o_pmlQ, - occa::memory& o_RHS, occa::memory& o_pmlRHS){ +void bns_t::rhsPmlRelaxation(dlong N, deviceMemory& o_ids, deviceMemory& o_pmlids, + deviceMemory& o_Q, deviceMemory& o_pmlQ, + deviceMemory& o_RHS, deviceMemory& o_pmlRHS){ // compute volume contribution to bns RHS if (N) { @@ -199,8 +199,8 @@ void bns_t::rhsPmlRelaxation(dlong N, occa::memory& o_ids, occa::memory& o_pmlid } } -void bns_t::rhsSurface(dlong N, occa::memory& o_ids, - occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){ +void bns_t::rhsSurface(dlong N, deviceMemory& o_ids, + deviceMemory& o_Q, deviceMemory& o_RHS, const dfloat T){ // compute volume contribution to bns RHS if (N) @@ -221,9 +221,9 @@ void bns_t::rhsSurface(dlong N, occa::memory& o_ids, o_RHS); } -void bns_t::rhsPmlSurface(dlong N, occa::memory& o_ids, occa::memory& o_pmlids, - occa::memory& o_Q, occa::memory& 
o_pmlQ, - occa::memory& o_RHS, occa::memory& o_pmlRHS, const dfloat T){ +void bns_t::rhsPmlSurface(dlong N, deviceMemory& o_ids, deviceMemory& o_pmlids, + deviceMemory& o_Q, deviceMemory& o_pmlQ, + deviceMemory& o_RHS, deviceMemory& o_pmlRHS, const dfloat T){ // compute volume contribution to bns RHS if (N) @@ -246,9 +246,9 @@ void bns_t::rhsPmlSurface(dlong N, occa::memory& o_ids, occa::memory& o_pmlids, o_pmlRHS); } -void bns_t::rhsSurfaceMR(dlong N, occa::memory& o_ids, - occa::memory& o_Q, occa::memory& o_RHS, - occa::memory& o_fQM, const dfloat T){ +void bns_t::rhsSurfaceMR(dlong N, deviceMemory& o_ids, + deviceMemory& o_Q, deviceMemory& o_RHS, + deviceMemory& o_fQM, const dfloat T){ // compute volume contribution to bns RHS if (N) @@ -270,10 +270,10 @@ void bns_t::rhsSurfaceMR(dlong N, occa::memory& o_ids, o_RHS); } -void bns_t::rhsPmlSurfaceMR(dlong N, occa::memory& o_ids, occa::memory& o_pmlids, - occa::memory& o_Q, occa::memory& o_pmlQ, - occa::memory& o_RHS, occa::memory& o_pmlRHS, - occa::memory& o_fQM, const dfloat T){ +void bns_t::rhsPmlSurfaceMR(dlong N, deviceMemory& o_ids, deviceMemory& o_pmlids, + deviceMemory& o_Q, deviceMemory& o_pmlQ, + deviceMemory& o_RHS, deviceMemory& o_pmlRHS, + deviceMemory& o_fQM, const dfloat T){ // compute volume contribution to bns RHS if (N) diff --git a/solvers/cns/cns.hpp b/solvers/cns/cns.hpp index 92c949faf..ae54e9a3f 100644 --- a/solvers/cns/cns.hpp +++ b/solvers/cns/cns.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -36,18 +36,20 @@ SOFTWARE. 
#define DCNS LIBP_DIR"/solvers/cns/" +using namespace libp; + class cnsSettings_t: public settings_t { public: - cnsSettings_t(MPI_Comm& _comm); + cnsSettings_t(comm_t _comm); void report(); void parseFromFile(platformSettings_t& platformSettings, meshSettings_t& meshSettings, - const string filename); + const std::string filename); }; class cns_t: public solver_t { public: - mesh_t &mesh; + mesh_t mesh; int Nfields; int Ngrads; @@ -58,57 +60,56 @@ class cns_t: public solver_t { int cubature; int isothermal; - TimeStepper::timeStepper_t* timeStepper; + timeStepper_t timeStepper; - halo_t* fieldTraceHalo; - halo_t* gradTraceHalo; + ogs::halo_t fieldTraceHalo; + ogs::halo_t gradTraceHalo; - dfloat *q; - occa::memory o_q; + memory q; + deviceMemory o_q; - dfloat *gradq; - occa::memory o_gradq; + memory gradq; + deviceMemory o_gradq; - dfloat *Vort; - occa::memory o_Vort; + memory Vort; + deviceMemory o_Vort; - occa::memory o_Mq; + deviceMemory o_Mq; - occa::kernel volumeKernel; - occa::kernel surfaceKernel; - occa::kernel cubatureVolumeKernel; - occa::kernel cubatureSurfaceKernel; + kernel_t volumeKernel; + kernel_t surfaceKernel; + kernel_t cubatureVolumeKernel; + kernel_t cubatureSurfaceKernel; - occa::kernel gradVolumeKernel; - occa::kernel gradSurfaceKernel; + kernel_t gradVolumeKernel; + kernel_t gradSurfaceKernel; - occa::kernel vorticityKernel; + kernel_t vorticityKernel; - occa::kernel constrainKernel; + kernel_t constrainKernel; - occa::kernel initialConditionKernel; - occa::kernel maxWaveSpeedKernel; + kernel_t initialConditionKernel; + kernel_t maxWaveSpeedKernel; - cns_t() = delete; + cns_t() = default; cns_t(platform_t &_platform, mesh_t &_mesh, - cnsSettings_t& _settings): - solver_t(_platform, _settings), mesh(_mesh) {} - - ~cns_t(); + cnsSettings_t& _settings) { + Setup(_platform, _mesh, _settings); + } //setup - static cns_t& Setup(platform_t& platform, mesh_t& mesh, - cnsSettings_t& settings); + void Setup(platform_t& _platform, mesh_t& _mesh, + 
cnsSettings_t& _settings); void Run(); - void Report(dfloat time, int tstep); + void Report(dfloat time, int tstep) override; - void PlotFields(dfloat* Q, dfloat *V, char *fileName); + void PlotFields(memory Q, memory V, std::string fileName); - void rhsf(occa::memory& o_q, occa::memory& o_rhs, const dfloat time); + void rhsf(deviceMemory& o_q, deviceMemory& o_rhs, const dfloat time); - dfloat MaxWaveSpeed(occa::memory& o_Q, const dfloat T); + dfloat MaxWaveSpeed(deviceMemory& o_Q, const dfloat T); }; #endif diff --git a/solvers/cns/cnsMain.cpp b/solvers/cns/cnsMain.cpp index 9fc888216..d028762fa 100644 --- a/solvers/cns/cnsMain.cpp +++ b/solvers/cns/cnsMain.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -29,39 +29,40 @@ SOFTWARE. 
int main(int argc, char **argv){ // start up MPI - MPI_Init(&argc, &argv); + Comm::Init(argc, argv); - MPI_Comm comm = MPI_COMM_WORLD; + LIBP_ABORT("Usage: ./cnsMain setupfile", argc!=2); - if(argc!=2) - LIBP_ABORT(string("Usage: ./cnsMain setupfile")); + { /*Scope so everything is destructed before MPI_Finalize */ + comm_t comm(Comm::World().Dup()); - //create default settings - platformSettings_t platformSettings(comm); - meshSettings_t meshSettings(comm); - cnsSettings_t cnsSettings(comm); + //create default settings + platformSettings_t platformSettings(comm); + meshSettings_t meshSettings(comm); + cnsSettings_t cnsSettings(comm); - //load settings from file - cnsSettings.parseFromFile(platformSettings, meshSettings, - argv[1]); + //load settings from file + cnsSettings.parseFromFile(platformSettings, meshSettings, + argv[1]); - // set up platform - platform_t platform(platformSettings); + // set up platform + platform_t platform(platformSettings); - platformSettings.report(); - meshSettings.report(); - cnsSettings.report(); + platformSettings.report(); + meshSettings.report(); + cnsSettings.report(); - // set up mesh - mesh_t& mesh = mesh_t::Setup(platform, meshSettings, comm); + // set up mesh + mesh_t mesh(platform, meshSettings, comm); - // set up cns solver - cns_t& cns = cns_t::Setup(platform, mesh, cnsSettings); + // set up cns solver + cns_t cns(platform, mesh, cnsSettings); - // run - cns.Run(); + // run + cns.Run(); + } // close down MPI - MPI_Finalize(); + Comm::Finalize(); return LIBP_SUCCESS; } diff --git a/solvers/cns/data/cnsGaussian2D.h b/solvers/cns/data/cnsGaussian2D.h index 48cf506d1..bc247cfe6 100644 --- a/solvers/cns/data/cnsGaussian2D.h +++ b/solvers/cns/data/cnsGaussian2D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy 
of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/cns/data/cnsGaussian3D.h b/solvers/cns/data/cnsGaussian3D.h index 1920900c5..74417fd32 100644 --- a/solvers/cns/data/cnsGaussian3D.h +++ b/solvers/cns/data/cnsGaussian3D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/cns/data/cnsUniform2D.h b/solvers/cns/data/cnsUniform2D.h index 9123fedee..05a37ec0c 100644 --- a/solvers/cns/data/cnsUniform2D.h +++ b/solvers/cns/data/cnsUniform2D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/cns/data/cnsUniform3D.h b/solvers/cns/data/cnsUniform3D.h index 0100058f1..15de1588f 100644 --- a/solvers/cns/data/cnsUniform3D.h +++ b/solvers/cns/data/cnsUniform3D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/cns/data/cnsVortexDipole2D.h b/solvers/cns/data/cnsVortexDipole2D.h index 0e31fdd50..cef94933d 100644 --- a/solvers/cns/data/cnsVortexDipole2D.h +++ b/solvers/cns/data/cnsVortexDipole2D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali 
Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/cns/makefile b/solvers/cns/makefile index 09a369c09..3fbe22f42 100644 --- a/solvers/cns/makefile +++ b/solvers/cns/makefile @@ -2,7 +2,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal @@ -78,11 +78,8 @@ include ../../make.top endif endif -#gslib -GS_DIR=${LIBP_TPL_DIR}/gslib - #libraries -CNS_LIBP_LIBS=timeStepper mesh ogs linAlg core +CNS_LIBP_LIBS=timeStepper mesh parAdogs ogs linAlg core #includes INCLUDES=${LIBP_INCLUDES} \ @@ -93,11 +90,10 @@ DEFINES =${LIBP_DEFINES} \ -DLIBP_DIR='"${LIBP_DIR}"' #.cpp compilation flags -CNS_CXXFLAGS=${LIBP_MPICXXFLAGS} ${DEFINES} ${INCLUDES} +CNS_CXXFLAGS=${LIBP_CXXFLAGS} ${DEFINES} ${INCLUDES} #link libraries LIBS=-L${LIBP_LIBS_DIR} $(addprefix -l,$(CNS_LIBP_LIBS)) \ - -L$(GS_DIR)/lib -lgs \ ${LIBP_LIBS} #link flags @@ -145,10 +141,10 @@ endif # rule for .cpp files %.o: %.cpp $(DEPS) | libp_libs ifneq (,${verbose}) - $(LIBP_MPICXX) -o $*.o -c $*.cpp $(CNS_CXXFLAGS) + $(LIBP_CXX) -o $*.o -c $*.cpp $(CNS_CXXFLAGS) else @printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n"; - @$(LIBP_MPICXX) -o $*.o -c $*.cpp $(CNS_CXXFLAGS) + @$(LIBP_CXX) -o $*.o -c $*.cpp $(CNS_CXXFLAGS) endif #cleanup @@ -159,8 +155,7 @@ clean-libs: clean ${MAKE} -C ${LIBP_LIBS_DIR} clean clean-kernels: clean-libs -# $(shell ${OCCA_DIR}/bin/occa clear all -y) - rm -rf ~/.occa/ + rm -rf ${LIBP_DIR}/.occa/ realclean: clean ${MAKE} -C ${LIBP_LIBS_DIR} realclean diff --git 
a/solvers/cns/okl/cnsConstrainQuad3D.okl b/solvers/cns/okl/cnsConstrainQuad3D.okl index fa78d6ca5..b99a6dd3b 100644 --- a/solvers/cns/okl/cnsConstrainQuad3D.okl +++ b/solvers/cns/okl/cnsConstrainQuad3D.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,34 +26,34 @@ SOFTWARE. @kernel void cnsConstrainQuad3D(const dlong Nelements, - @restrict const dfloat * x, - @restrict const dfloat * y, - @restrict const dfloat * z, - @restrict dfloat * rhsq){ + @restrict const dfloat * x, + @restrict const dfloat * y, + @restrict const dfloat * z, + @restrict dfloat * rhsq){ for(dlong e=0;e Q, memory V, std::string fileName){ FILE *fp; - fp = fopen(fileName, "w"); + fp = fopen(fileName.c_str(), "w"); fprintf(fp, "\n"); fprintf(fp, " \n"); @@ -44,40 +44,46 @@ void cns_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){ fprintf(fp, " \n"); //scratch space for interpolation - size_t NscratchBytes = mymax(mesh.Np, mesh.plotNp)*sizeof(dfloat); - dfloat* scratch = (dfloat *) malloc(2*NscratchBytes); + size_t Nscratch = std::max(mesh.Np, mesh.plotNp); + memory scratch(2*Nscratch); - dfloat* Ix = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iy = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iz = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); + memory Ix(mesh.plotNp); + memory Iy(mesh.plotNp); + memory Iz(mesh.plotNp); // compute plot node coordinates on the fly for(dlong e=0;e\n"); fprintf(fp, " \n"); - free(Ix); free(Iy); free(Iz); - - dfloat* p = (dfloat *) malloc(mesh.Np*sizeof(dfloat)); - dfloat* u = (dfloat *) malloc(mesh.Np*sizeof(dfloat)); - dfloat* v = (dfloat *) malloc(mesh.Np*sizeof(dfloat)); - dfloat* w = (dfloat *) malloc(mesh.Np*sizeof(dfloat)); + 
memory p(mesh.Np); + memory u(mesh.Np); + memory v(mesh.Np); + memory w(mesh.Np); - dfloat* Ip = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iu = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iv = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iw = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); + memory Ip(mesh.plotNp); + memory Iu(mesh.plotNp); + memory Iv(mesh.plotNp); + memory Iw(mesh.plotNp); - if (Q!=NULL) { + if (Q.length()!=0) { // write out density fprintf(fp, " \n"); fprintf(fp, " \n"); @@ -144,7 +150,7 @@ void cns_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){ } } - if (V!=NULL) { + if (V.length()!=0) { // write out vorticity if(mesh.dim==2){ fprintf(fp, " \n"); @@ -175,9 +181,6 @@ void cns_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){ fprintf(fp, " \n"); - free(p); free(u); free(v); free(w); - free(Ip); free(Iu); free(Iv); free(Iw); - fprintf(fp, " \n"); fprintf(fp, " \n"); @@ -218,6 +221,4 @@ void cns_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){ fprintf(fp, " \n"); fprintf(fp, "\n"); fclose(fp); - - free(scratch); } diff --git a/solvers/cns/src/cnsReport.cpp b/solvers/cns/src/cnsReport.cpp index b196b20af..474fb69aa 100644 --- a/solvers/cns/src/cnsReport.cpp +++ b/solvers/cns/src/cnsReport.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -37,7 +37,7 @@ void cns_t::Report(dfloat time, int tstep){ mesh.MassMatrixApply(o_q, o_Mq); dlong Nentries = mesh.Nelements*mesh.Np*Nfields; - dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm)); + dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm)); if(mesh.rank==0) printf("%5.2f (%d), %5.2f (time, timestep, 
norm)\n", time, tstep, norm2); @@ -49,11 +49,11 @@ void cns_t::Report(dfloat time, int tstep){ o_Vort.copyTo(Vort); // output field files - string name; + std::string name; settings.getSetting("OUTPUT FILE NAME", name); char fname[BUFSIZ]; sprintf(fname, "%s_%04d_%04d.vtu", name.c_str(), mesh.rank, frame++); - PlotFields(q, Vort, fname); + PlotFields(q, Vort, std::string(fname)); } } diff --git a/solvers/cns/src/cnsRun.cpp b/solvers/cns/src/cnsRun.cpp index b7e70dae3..5fa52e93d 100644 --- a/solvers/cns/src/cnsRun.cpp +++ b/solvers/cns/src/cnsRun.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) - Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -51,10 +51,10 @@ void cns_t::Run(){ dfloat dtAdv = cfl/(vmax*(mesh.N+1.)*(mesh.N+1.)); dfloat dtVisc = cfl*pow(hmin, 2)/(pow(mesh.N+1,4)*mu); - dfloat dt = mymin(dtAdv, dtVisc); - timeStepper->SetTimeStep(dt); + dfloat dt = std::min(dtAdv, dtVisc); + timeStepper.SetTimeStep(dt); - timeStepper->Run(o_q, startTime, finalTime); + timeStepper.Run(*this, o_q, startTime, finalTime); // output norm of final solution { @@ -62,7 +62,7 @@ void cns_t::Run(){ mesh.MassMatrixApply(o_q, o_Mq); dlong Nentries = mesh.Nelements*mesh.Np*Nfields; - dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm)); + dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm)); if(mesh.rank==0) printf("Solution norm = %17.15lg\n", norm2); diff --git a/solvers/cns/src/cnsSettings.cpp b/solvers/cns/src/cnsSettings.cpp index 1d239a867..628d79f22 100644 --- a/solvers/cns/src/cnsSettings.cpp +++ b/solvers/cns/src/cnsSettings.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim 
Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,7 +27,7 @@ SOFTWARE. #include "cns.hpp" //settings for cns solver -cnsSettings_t::cnsSettings_t(MPI_Comm& _comm): +cnsSettings_t::cnsSettings_t(comm_t _comm): settings_t(_comm) { newSetting("DATA FILE", @@ -84,10 +84,7 @@ cnsSettings_t::cnsSettings_t(MPI_Comm& _comm): void cnsSettings_t::report() { - int rank; - MPI_Comm_rank(comm, &rank); - - if (rank==0) { + if (comm.rank()==0) { std::cout << "CNS Settings:\n\n"; reportSetting("DATA FILE"); @@ -106,15 +103,15 @@ void cnsSettings_t::report() { void cnsSettings_t::parseFromFile(platformSettings_t& platformSettings, meshSettings_t& meshSettings, - const string filename) { + const std::string filename) { //read all settings from file settings_t s(comm); s.readSettingsFromFile(filename); for(auto it = s.settings.begin(); it != s.settings.end(); ++it) { - setting_t* set = it->second; - const string name = set->getName(); - const string val = set->getVal(); + setting_t& set = it->second; + const std::string name = set.getName(); + const std::string val = set.getVal(); if (platformSettings.hasSetting(name)) platformSettings.changeSetting(name, val); else if (meshSettings.hasSetting(name)) @@ -122,9 +119,7 @@ void cnsSettings_t::parseFromFile(platformSettings_t& platformSettings, else if (hasSetting(name)) //self changeSetting(name, val); else { - stringstream ss; - ss << "Unknown setting: [" << name << "] requested"; - LIBP_ABORT(ss.str()); + LIBP_FORCE_ABORT("Unknown setting: [" << name << "] requested"); } } -} \ No newline at end of file +} diff --git a/solvers/cns/src/cnsSetup.cpp b/solvers/cns/src/cnsSetup.cpp index a7254037b..9d7d3a8d8 100644 --- a/solvers/cns/src/cnsSetup.cpp +++ b/solvers/cns/src/cnsSetup.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse 
Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,243 +26,230 @@ SOFTWARE. #include "cns.hpp" -cns_t& cns_t::Setup(platform_t& platform, mesh_t& mesh, - cnsSettings_t& settings){ +void cns_t::Setup(platform_t& _platform, mesh_t& _mesh, + cnsSettings_t& _settings){ - cns_t* cns = new cns_t(platform, mesh, settings); + platform = _platform; + mesh = _mesh; + comm = _mesh.comm; + settings = _settings; + + //Trigger JIT kernel builds + ogs::InitializeKernels(platform, ogs::Dfloat, ogs::Add); //get physical paramters - settings.getSetting("VISCOSITY", cns->mu); - settings.getSetting("GAMMA", cns->gamma); + settings.getSetting("VISCOSITY", mu); + settings.getSetting("GAMMA", gamma); - cns->cubature = (settings.compareSetting("ADVECTION TYPE", "CUBATURE")) ? 1:0; - cns->isothermal = (settings.compareSetting("ISOTHERMAL", "TRUE")) ? 1:0; + cubature = (settings.compareSetting("ADVECTION TYPE", "CUBATURE")) ? 1:0; + isothermal = (settings.compareSetting("ISOTHERMAL", "TRUE")) ? 1:0; //setup cubature - if (cns->cubature) { + if (cubature) { mesh.CubatureSetup(); - mesh.CubatureNodes(); + mesh.CubaturePhysicalNodes(); } - cns->Nfields = (mesh.dim==3) ? 4:3; - cns->Ngrads = mesh.dim*mesh.dim; + Nfields = (mesh.dim==3) ? 
4:3; + Ngrads = mesh.dim*mesh.dim; - if (!cns->isothermal) cns->Nfields++; //include energy equation + if (!isothermal) Nfields++; //include energy equation - dlong NlocalFields = mesh.Nelements*mesh.Np*cns->Nfields; - dlong NhaloFields = mesh.totalHaloPairs*mesh.Np*cns->Nfields; - dlong NlocalGrads = mesh.Nelements*mesh.Np*cns->Ngrads; - dlong NhaloGrads = mesh.totalHaloPairs*mesh.Np*cns->Ngrads; + dlong NlocalFields = mesh.Nelements*mesh.Np*Nfields; + dlong NhaloFields = mesh.totalHaloPairs*mesh.Np*Nfields; + dlong NlocalGrads = mesh.Nelements*mesh.Np*Ngrads; + dlong NhaloGrads = mesh.totalHaloPairs*mesh.Np*Ngrads; //setup timeStepper if (settings.compareSetting("TIME INTEGRATOR","AB3")){ - cns->timeStepper = new TimeStepper::ab3(mesh.Nelements, mesh.totalHaloPairs, - mesh.Np, cns->Nfields, *cns); + timeStepper.Setup(mesh.Nelements, + mesh.totalHaloPairs, + mesh.Np, Nfields, platform, comm); } else if (settings.compareSetting("TIME INTEGRATOR","LSERK4")){ - cns->timeStepper = new TimeStepper::lserk4(mesh.Nelements, mesh.totalHaloPairs, - mesh.Np, cns->Nfields, *cns); + timeStepper.Setup(mesh.Nelements, + mesh.totalHaloPairs, + mesh.Np, Nfields, platform, comm); } else if (settings.compareSetting("TIME INTEGRATOR","DOPRI5")){ - cns->timeStepper = new TimeStepper::dopri5(mesh.Nelements, mesh.totalHaloPairs, - mesh.Np, cns->Nfields, *cns, mesh.comm); + timeStepper.Setup(mesh.Nelements, + mesh.totalHaloPairs, + mesh.Np, Nfields, platform, comm); } //setup linear algebra module - platform.linAlg.InitKernels({"innerProd", "max"}); + platform.linAlg().InitKernels({"innerProd", "max"}); /*setup trace halo exchange */ - cns->fieldTraceHalo = mesh.HaloTraceSetup(cns->Nfields); - cns->gradTraceHalo = mesh.HaloTraceSetup(cns->Ngrads); + fieldTraceHalo = mesh.HaloTraceSetup(Nfields); + gradTraceHalo = mesh.HaloTraceSetup(Ngrads); // compute samples of q at interpolation nodes - cns->q = (dfloat*) calloc(NlocalFields+NhaloFields, sizeof(dfloat)); - cns->o_q = 
platform.malloc((NlocalFields+NhaloFields)*sizeof(dfloat), - cns->q); + q.malloc(NlocalFields+NhaloFields); + o_q = platform.malloc(q); - cns->gradq = (dfloat*) calloc(NlocalGrads+NhaloGrads, sizeof(dfloat)); - cns->o_gradq = platform.malloc((NlocalGrads+NhaloGrads)*sizeof(dfloat), - cns->gradq); + gradq.malloc(NlocalGrads+NhaloGrads); + o_gradq = platform.malloc(gradq); - cns->Vort = (dfloat*) calloc(mesh.dim*mesh.Nelements*mesh.Np, sizeof(dfloat)); - cns->o_Vort = platform.malloc((mesh.dim*mesh.Nelements*mesh.Np)*sizeof(dfloat), - cns->Vort); + Vort.malloc(mesh.dim*mesh.Nelements*mesh.Np); + o_Vort = platform.malloc(Vort); //storage for M*q during reporting - cns->o_Mq = platform.malloc((NlocalFields+NhaloFields)*sizeof(dfloat), cns->q); - mesh.MassMatrixKernelSetup(cns->Nfields); // mass matrix operator + o_Mq = platform.malloc(q); + mesh.MassMatrixKernelSetup(Nfields); // mass matrix operator // OCCA build stuff - occa::properties kernelInfo = mesh.props; //copy base occa properties + properties_t kernelInfo = mesh.props; //copy base occa properties //add boundary data to kernel info - string dataFileName; + std::string dataFileName; settings.getSetting("DATA FILE", dataFileName); kernelInfo["includes"] += dataFileName; - kernelInfo["defines/" "p_Nfields"]= cns->Nfields; - kernelInfo["defines/" "p_Ngrads"]= cns->Ngrads; + kernelInfo["defines/" "p_Nfields"]= Nfields; + kernelInfo["defines/" "p_Ngrads"]= Ngrads; - int maxNodes = mymax(mesh.Np, (mesh.Nfp*mesh.Nfaces)); + int maxNodes = std::max(mesh.Np, (mesh.Nfp*mesh.Nfaces)); kernelInfo["defines/" "p_maxNodes"]= maxNodes; int blockMax = 256; if (platform.device.mode() == "CUDA") blockMax = 512; - int NblockV = mymax(1, blockMax/mesh.Np); + int NblockV = std::max(1, blockMax/mesh.Np); kernelInfo["defines/" "p_NblockV"]= NblockV; - int NblockS = mymax(1, blockMax/maxNodes); + int NblockS = std::max(1, blockMax/maxNodes); kernelInfo["defines/" "p_NblockS"]= NblockS; - if (cns->cubature) { - int cubMaxNodes = 
mymax(mesh.Np, (mesh.intNfp*mesh.Nfaces)); + if (cubature) { + int cubMaxNodes = std::max(mesh.Np, (mesh.intNfp*mesh.Nfaces)); kernelInfo["defines/" "p_cubMaxNodes"]= cubMaxNodes; - int cubMaxNodes1 = mymax(mesh.Np, (mesh.intNfp)); + int cubMaxNodes1 = std::max(mesh.Np, (mesh.intNfp)); kernelInfo["defines/" "p_cubMaxNodes1"]= cubMaxNodes1; - int cubNblockV = mymax(1, blockMax/mesh.cubNp); + int cubNblockV = std::max(1, blockMax/mesh.cubNp); kernelInfo["defines/" "p_cubNblockV"]= cubNblockV; - int cubNblockS = mymax(1, blockMax/cubMaxNodes); + int cubNblockS = std::max(1, blockMax/cubMaxNodes); kernelInfo["defines/" "p_cubNblockS"]= cubNblockS; } - kernelInfo["parser/" "automate-add-barriers"] = "disabled"; - // set kernel name suffix - char *suffix; - if(mesh.elementType==TRIANGLES) - suffix = strdup("Tri2D"); - if(mesh.elementType==QUADRILATERALS) - suffix = strdup("Quad2D"); - if(mesh.elementType==TETRAHEDRA) - suffix = strdup("Tet3D"); - if(mesh.elementType==HEXAHEDRA) - suffix = strdup("Hex3D"); - - char fileName[BUFSIZ], kernelName[BUFSIZ]; - - if (cns->isothermal) { - if (cns->cubature) { + std::string suffix; + if(mesh.elementType==Mesh::TRIANGLES) + suffix = "Tri2D"; + if(mesh.elementType==Mesh::QUADRILATERALS) + suffix = "Quad2D"; + if(mesh.elementType==Mesh::TETRAHEDRA) + suffix = "Tet3D"; + if(mesh.elementType==Mesh::HEXAHEDRA) + suffix = "Hex3D"; + + std::string oklFilePrefix = DCNS "/okl/"; + std::string oklFileSuffix = ".okl"; + + std::string fileName, kernelName; + + if (isothermal) { + if (cubature) { // kernels from volume file - sprintf(fileName, DCNS "/okl/cnsIsothermalCubatureVolume%s.okl", suffix); - sprintf(kernelName, "cnsIsothermalCubatureVolume%s", suffix); + fileName = oklFilePrefix + "cnsIsothermalCubatureVolume" + suffix + oklFileSuffix; + kernelName = "cnsIsothermalCubatureVolume" + suffix; - cns->cubatureVolumeKernel = platform.buildKernel(fileName, kernelName, + cubatureVolumeKernel = platform.buildKernel(fileName, kernelName, 
kernelInfo); // kernels from surface file - sprintf(fileName, DCNS "/okl/cnsIsothermalCubatureSurface%s.okl", suffix); - sprintf(kernelName, "cnsIsothermalCubatureSurface%s", suffix); + fileName = oklFilePrefix + "cnsIsothermalCubatureSurface" + suffix + oklFileSuffix; + kernelName = "cnsIsothermalCubatureSurface" + suffix; - cns->cubatureSurfaceKernel = platform.buildKernel(fileName, kernelName, + cubatureSurfaceKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } else { // kernels from volume file - sprintf(fileName, DCNS "/okl/cnsIsothermalVolume%s.okl", suffix); - sprintf(kernelName, "cnsIsothermalVolume%s", suffix); + fileName = oklFilePrefix + "cnsIsothermalVolume" + suffix + oklFileSuffix; + kernelName = "cnsIsothermalVolume" + suffix; - cns->volumeKernel = platform.buildKernel(fileName, kernelName, + volumeKernel = platform.buildKernel(fileName, kernelName, kernelInfo); // kernels from surface file - sprintf(fileName, DCNS "/okl/cnsIsothermalSurface%s.okl", suffix); - sprintf(kernelName, "cnsIsothermalSurface%s", suffix); + fileName = oklFilePrefix + "cnsIsothermalSurface" + suffix + oklFileSuffix; + kernelName = "cnsIsothermalSurface" + suffix; - cns->surfaceKernel = platform.buildKernel(fileName, kernelName, + surfaceKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } } else { - if (cns->cubature) { + if (cubature) { // kernels from volume file - sprintf(fileName, DCNS "/okl/cnsCubatureVolume%s.okl", suffix); - sprintf(kernelName, "cnsCubatureVolume%s", suffix); + fileName = oklFilePrefix + "cnsCubatureVolume" + suffix + oklFileSuffix; + kernelName = "cnsCubatureVolume" + suffix; - cns->cubatureVolumeKernel = platform.buildKernel(fileName, kernelName, + cubatureVolumeKernel = platform.buildKernel(fileName, kernelName, kernelInfo); // kernels from surface file - sprintf(fileName, DCNS "/okl/cnsCubatureSurface%s.okl", suffix); - sprintf(kernelName, "cnsCubatureSurface%s", suffix); + fileName = oklFilePrefix + 
"cnsCubatureSurface" + suffix + oklFileSuffix; + kernelName = "cnsCubatureSurface" + suffix; - cns->cubatureSurfaceKernel = platform.buildKernel(fileName, kernelName, + cubatureSurfaceKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } else { // kernels from volume file - sprintf(fileName, DCNS "/okl/cnsVolume%s.okl", suffix); - sprintf(kernelName, "cnsVolume%s", suffix); + fileName = oklFilePrefix + "cnsVolume" + suffix + oklFileSuffix; + kernelName = "cnsVolume" + suffix; - cns->volumeKernel = platform.buildKernel(fileName, kernelName, + volumeKernel = platform.buildKernel(fileName, kernelName, kernelInfo); // kernels from surface file - sprintf(fileName, DCNS "/okl/cnsSurface%s.okl", suffix); - sprintf(kernelName, "cnsSurface%s", suffix); + fileName = oklFilePrefix + "cnsSurface" + suffix + oklFileSuffix; + kernelName = "cnsSurface" + suffix; - cns->surfaceKernel = platform.buildKernel(fileName, kernelName, + surfaceKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } } // kernels from volume file - sprintf(fileName, DCNS "/okl/cnsGradVolume%s.okl", suffix); - sprintf(kernelName, "cnsGradVolume%s", suffix); + fileName = oklFilePrefix + "cnsGradVolume" + suffix + oklFileSuffix; + kernelName = "cnsGradVolume" + suffix; - cns->gradVolumeKernel = platform.buildKernel(fileName, kernelName, + gradVolumeKernel = platform.buildKernel(fileName, kernelName, kernelInfo); // kernels from surface file - sprintf(fileName, DCNS "/okl/cnsGradSurface%s.okl", suffix); - sprintf(kernelName, "cnsGradSurface%s", suffix); + fileName = oklFilePrefix + "cnsGradSurface" + suffix + oklFileSuffix; + kernelName = "cnsGradSurface" + suffix; - cns->gradSurfaceKernel = platform.buildKernel(fileName, kernelName, + gradSurfaceKernel = platform.buildKernel(fileName, kernelName, kernelInfo); // vorticity calculation - sprintf(fileName, DCNS "/okl/cnsVorticity%s.okl", suffix); - sprintf(kernelName, "cnsVorticity%s", suffix); + fileName = oklFilePrefix + "cnsVorticity" 
+ suffix + oklFileSuffix; + kernelName = "cnsVorticity" + suffix; - cns->vorticityKernel = platform.buildKernel(fileName, kernelName, + vorticityKernel = platform.buildKernel(fileName, kernelName, kernelInfo); if (mesh.dim==2) { - sprintf(fileName, DCNS "/okl/cnsInitialCondition2D.okl"); - if (cns->isothermal) - sprintf(kernelName, "cnsIsothermalInitialCondition2D"); + fileName = oklFilePrefix + "cnsInitialCondition2D" + oklFileSuffix; + if (isothermal) + kernelName = "cnsIsothermalInitialCondition2D"; else - sprintf(kernelName, "cnsInitialCondition2D"); + kernelName = "cnsInitialCondition2D"; } else { - sprintf(fileName, DCNS "/okl/cnsInitialCondition3D.okl"); - if (cns->isothermal) - sprintf(kernelName, "cnsIsothermalInitialCondition3D"); + fileName = oklFilePrefix + "cnsInitialCondition3D" + oklFileSuffix; + if (isothermal) + kernelName = "cnsIsothermalInitialCondition3D"; else - sprintf(kernelName, "cnsInitialCondition3D"); + kernelName = "cnsInitialCondition3D"; } - - cns->initialConditionKernel = platform.buildKernel(fileName, kernelName, + initialConditionKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(fileName, DCNS "/okl/cnsMaxWaveSpeed%s.okl", suffix); - if (cns->isothermal) { - sprintf(kernelName, "cnsIsothermalMaxWaveSpeed%s", suffix); + fileName = oklFilePrefix + "cnsMaxWaveSpeed" + suffix + oklFileSuffix; + if (isothermal) { + kernelName = "cnsIsothermalMaxWaveSpeed" + suffix; } else { - sprintf(kernelName, "cnsMaxWaveSpeed%s", suffix); + kernelName = "cnsMaxWaveSpeed" + suffix; } - cns->maxWaveSpeedKernel = platform.buildKernel(fileName, kernelName, + maxWaveSpeedKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - - return *cns; -} - -cns_t::~cns_t() { - volumeKernel.free(); - surfaceKernel.free(); - cubatureVolumeKernel.free(); - cubatureSurfaceKernel.free(); - gradVolumeKernel.free(); - gradSurfaceKernel.free(); - vorticityKernel.free(); - constrainKernel.free(); - initialConditionKernel.free(); - 
maxWaveSpeedKernel.free(); - - if (timeStepper) delete timeStepper; - if (fieldTraceHalo) fieldTraceHalo->Free(); - if (gradTraceHalo) gradTraceHalo->Free(); } diff --git a/solvers/cns/src/cnsStep.cpp b/solvers/cns/src/cnsStep.cpp index c0f35a403..1544320a2 100644 --- a/solvers/cns/src/cnsStep.cpp +++ b/solvers/cns/src/cnsStep.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,10 +26,10 @@ SOFTWARE. #include "cns.hpp" -dfloat cns_t::MaxWaveSpeed(occa::memory& o_Q, const dfloat T){ +dfloat cns_t::MaxWaveSpeed(deviceMemory& o_Q, const dfloat T){ //Note: if this is on the critical path in the future, we should pre-allocate this - occa::memory o_maxSpeed = platform.malloc(mesh.Nelements*sizeof(dfloat)); + deviceMemory o_maxSpeed = platform.malloc(mesh.Nelements); maxWaveSpeedKernel(mesh.Nelements, mesh.o_vgeo, @@ -45,17 +45,16 @@ dfloat cns_t::MaxWaveSpeed(occa::memory& o_Q, const dfloat T){ o_Q, o_maxSpeed); - const dfloat vmax = platform.linAlg.max(mesh.Nelements, o_maxSpeed, mesh.comm); + const dfloat vmax = platform.linAlg().max(mesh.Nelements, o_maxSpeed, mesh.comm); - o_maxSpeed.free(); return vmax; } //evaluate ODE rhs = f(q,t) -void cns_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){ +void cns_t::rhsf(deviceMemory& o_Q, deviceMemory& o_RHS, const dfloat T){ // extract q trace halo and start exchange - fieldTraceHalo->ExchangeStart(o_Q, 1, ogs_dfloat); + fieldTraceHalo.ExchangeStart(o_Q, 1); // compute volume contributions to gradients gradVolumeKernel(mesh.Nelements, @@ -65,7 +64,7 @@ void cns_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){ o_gradq); // complete trace halo exchange - fieldTraceHalo->ExchangeFinish(o_Q, 1, 
ogs_dfloat); + fieldTraceHalo.ExchangeFinish(o_Q, 1); // compute surface contributions to gradients gradSurfaceKernel(mesh.Nelements, @@ -84,7 +83,7 @@ void cns_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){ o_gradq); // extract viscousStresses trace halo and start exchange - gradTraceHalo->ExchangeStart(o_gradq, 1, ogs_dfloat); + gradTraceHalo.ExchangeStart(o_gradq, 1); // compute volume contribution to cns RHS if (cubature) { @@ -120,7 +119,7 @@ void cns_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){ } // complete trace halo exchange - gradTraceHalo->ExchangeFinish(o_gradq, 1, ogs_dfloat); + gradTraceHalo.ExchangeFinish(o_gradq, 1); if (cubature) { cubatureSurfaceKernel(mesh.Nelements, diff --git a/solvers/elliptic/data/ellipticBoundary2D.h b/solvers/elliptic/data/ellipticBoundary2D.h index dcd3a716e..0dcbb475d 100644 --- a/solvers/elliptic/data/ellipticBoundary2D.h +++ b/solvers/elliptic/data/ellipticBoundary2D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/elliptic/data/ellipticBoundary3D.h b/solvers/elliptic/data/ellipticBoundary3D.h index 722794907..1d4bfcdd3 100644 --- a/solvers/elliptic/data/ellipticBoundary3D.h +++ b/solvers/elliptic/data/ellipticBoundary3D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/elliptic/data/ellipticHomogeneous2D.h b/solvers/elliptic/data/ellipticHomogeneous2D.h 
index 15b4c8079..259525e9b 100644 --- a/solvers/elliptic/data/ellipticHomogeneous2D.h +++ b/solvers/elliptic/data/ellipticHomogeneous2D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/elliptic/data/ellipticHomogeneous3D.h b/solvers/elliptic/data/ellipticHomogeneous3D.h index 36a684078..603ccc2f0 100644 --- a/solvers/elliptic/data/ellipticHomogeneous3D.h +++ b/solvers/elliptic/data/ellipticHomogeneous3D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/elliptic/data/ellipticSine2D.h b/solvers/elliptic/data/ellipticSine2D.h index a170b990a..820575d65 100644 --- a/solvers/elliptic/data/ellipticSine2D.h +++ b/solvers/elliptic/data/ellipticSine2D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/elliptic/data/ellipticSine3D.h b/solvers/elliptic/data/ellipticSine3D.h index 574c786de..342c627a5 100644 --- a/solvers/elliptic/data/ellipticSine3D.h +++ b/solvers/elliptic/data/ellipticSine3D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim 
Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/elliptic/elliptic.hpp b/solvers/elliptic/elliptic.hpp index 60533aa57..96ba5b972 100644 --- a/solvers/elliptic/elliptic.hpp +++ b/solvers/elliptic/elliptic.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -38,22 +38,24 @@ SOFTWARE. #define DELLIPTIC LIBP_DIR"/solvers/elliptic/" +using namespace libp; + class ellipticSettings_t: public settings_t { public: - ellipticSettings_t(const MPI_Comm& _comm); + ellipticSettings_t() = default; + ellipticSettings_t(const comm_t& _comm); void report(); void parseFromFile(platformSettings_t& platformSettings, meshSettings_t& meshSettings, - const string filename); + const std::string filename); }; void ellipticAddRunSettings(settings_t& settings); void ellipticAddSettings(settings_t& settings, - const string prefix=""); + const std::string prefix=""); class elliptic_t: public solver_t { public: - mesh_t &mesh; - linAlg_t &linAlg; + mesh_t mesh; dlong Ndofs, Nhalo; int Nfields; @@ -63,66 +65,69 @@ class elliptic_t: public solver_t { int disc_ipdg, disc_c0; - occa::memory o_AqL; + deviceMemory o_AqL; - halo_t* traceHalo; + ogs::halo_t traceHalo; - precon_t* precon; + precon_t precon; - dfloat *grad; - occa::memory o_grad; + memory grad; + deviceMemory o_grad; - dfloat *weight, *weightG; - occa::memory o_weight, o_weightG; + memory weight, weightG; + deviceMemory o_weight, o_weightG; //C0-FEM mask data - ogs_t *ogsMasked; - int *mapB; // boundary flag of face nodes + ogs::ogs_t ogsMasked; + ogs::halo_t gHalo; + 
memory mapB; // boundary flag of face nodes + deviceMemory o_mapB; dlong Nmasked; - dlong *maskIds; - hlong *maskedGlobalIds; - hlong *maskedGlobalNumbering; + memory maskIds; + memory maskedGlobalIds; + memory maskedGlobalNumbering; + memory GlobalToLocal; - occa::memory o_maskIds; - occa::memory o_mapB; + deviceMemory o_maskIds; + deviceMemory o_GlobalToLocal; - int *BCType; - int *EToB; - occa::memory o_EToB; + int NBCTypes; + memory BCType; + memory EToB; + deviceMemory o_EToB; int allNeumann; dfloat allNeumannPenalty; dfloat allNeumannScale; - occa::kernel maskKernel; - occa::kernel partialAxKernel; - occa::kernel partialGradientKernel; - occa::kernel partialIpdgKernel; + kernel_t maskKernel; + kernel_t partialAxKernel; + kernel_t partialGradientKernel; + kernel_t partialIpdgKernel; - elliptic_t() = delete; + elliptic_t() = default; elliptic_t(platform_t &_platform, mesh_t &_mesh, - settings_t& _settings, dfloat _lambda): - solver_t(_platform, _settings), mesh(_mesh), - linAlg(_platform.linAlg), lambda(_lambda) {} - - ~elliptic_t(); + settings_t& _settings, dfloat _lambda, + const int _NBCTypes, const memory _BCType) { + Setup(_platform, _mesh, _settings, _lambda, _NBCTypes, _BCType); + } //setup - static elliptic_t& Setup(platform_t& platform, mesh_t& mesh, - ellipticSettings_t& settings, dfloat lambda, - const int NBCTypes, const int *BCType); + void Setup(platform_t& _platform, mesh_t& _mesh, + settings_t& _settings, dfloat _lambda, + const int _NBCTypes, const memory _BCType); void BoundarySetup(); void Run(); - int Solve(linearSolver_t& linearSolver, occa::memory &o_x, occa::memory &o_r, + int Solve(linearSolver_t& linearSolver, deviceMemory &o_x, deviceMemory &o_r, const dfloat tol, const int MAXIT, const int verbose); - void PlotFields(dfloat* Q, char *fileName); + void PlotFields(memory& Q, std::string fileName); - void Operator(occa::memory& o_q, occa::memory& o_Aq); + void Operator(deviceMemory& o_q, deviceMemory& o_Aq); void 
BuildOperatorMatrixIpdg(parAlmond::parCOO& A); void BuildOperatorMatrixContinuous(parAlmond::parCOO& A); @@ -141,27 +146,27 @@ class elliptic_t: public solver_t { void BuildOperatorMatrixIpdgTet3D(parAlmond::parCOO& A); void BuildOperatorMatrixIpdgHex3D(parAlmond::parCOO& A); - void BuildOperatorDiagonal(dfloat *diagA); + void BuildOperatorDiagonal(memory& diagA); - void BuildOperatorDiagonalContinuousTri2D(dfloat *diagA); - void BuildOperatorDiagonalContinuousTri3D(dfloat *diagA); - void BuildOperatorDiagonalContinuousQuad2D(dfloat *diagA); - void BuildOperatorDiagonalContinuousQuad3D(dfloat *diagA); - void BuildOperatorDiagonalContinuousTet3D(dfloat *diagA); - void BuildOperatorDiagonalContinuousHex3D(dfloat *diagA); + void BuildOperatorDiagonalContinuousTri2D(memory& diagA); + void BuildOperatorDiagonalContinuousTri3D(memory& diagA); + void BuildOperatorDiagonalContinuousQuad2D(memory& diagA); + void BuildOperatorDiagonalContinuousQuad3D(memory& diagA); + void BuildOperatorDiagonalContinuousTet3D(memory& diagA); + void BuildOperatorDiagonalContinuousHex3D(memory& diagA); - void BuildOperatorDiagonalIpdgTri2D(dfloat *diagA); - void BuildOperatorDiagonalIpdgTri3D(dfloat *diagA); - void BuildOperatorDiagonalIpdgQuad2D(dfloat *diagA); - void BuildOperatorDiagonalIpdgQuad3D(dfloat *diagA); - void BuildOperatorDiagonalIpdgTet3D(dfloat *diagA); - void BuildOperatorDiagonalIpdgHex3D(dfloat *diagA); + void BuildOperatorDiagonalIpdgTri2D(memory& diagA); + void BuildOperatorDiagonalIpdgTri3D(memory& diagA); + void BuildOperatorDiagonalIpdgQuad2D(memory& diagA); + void BuildOperatorDiagonalIpdgQuad3D(memory& diagA); + void BuildOperatorDiagonalIpdgTet3D(memory& diagA); + void BuildOperatorDiagonalIpdgHex3D(memory& diagA); - elliptic_t& SetupNewDegree(mesh_t& meshF); + elliptic_t SetupNewDegree(mesh_t& meshF); - elliptic_t* SetupRingPatch(mesh_t& meshPatch); + elliptic_t SetupRingPatch(mesh_t& meshPatch); - void ZeroMean(occa::memory &o_q); + void ZeroMean(deviceMemory 
&o_q); }; diff --git a/solvers/elliptic/ellipticMain.cpp b/solvers/elliptic/ellipticMain.cpp index 5aa74d071..81977f4a8 100644 --- a/solvers/elliptic/ellipticMain.cpp +++ b/solvers/elliptic/ellipticMain.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -29,48 +29,52 @@ SOFTWARE. int main(int argc, char **argv){ // start up MPI - MPI_Init(&argc, &argv); + Comm::Init(argc, argv); - MPI_Comm comm = MPI_COMM_WORLD; + LIBP_ABORT("Usage: ./ellipticMain setupfile", argc!=2); - if(argc!=2) - LIBP_ABORT(string("Usage: ./ellipticMain setupfile")); + { /*Scope so everything is destructed before MPI_Finalize */ + comm_t comm(Comm::World().Dup()); - //create default settings - platformSettings_t platformSettings(comm); - meshSettings_t meshSettings(comm); - ellipticSettings_t ellipticSettings(comm); - ellipticAddRunSettings(ellipticSettings); + //create default settings + platformSettings_t platformSettings(comm); + meshSettings_t meshSettings(comm); + ellipticSettings_t ellipticSettings(comm); + ellipticAddRunSettings(ellipticSettings); - //load settings from file - ellipticSettings.parseFromFile(platformSettings, meshSettings, - argv[1]); + //load settings from file + ellipticSettings.parseFromFile(platformSettings, meshSettings, + argv[1]); - // set up platform - platform_t platform(platformSettings); + // set up platform + platform_t platform(platformSettings); - platformSettings.report(); - meshSettings.report(); - ellipticSettings.report(); + platformSettings.report(); + meshSettings.report(); + ellipticSettings.report(); - // set up mesh - mesh_t& mesh = mesh_t::Setup(platform, meshSettings, comm); + // set up mesh + mesh_t mesh(platform, meshSettings, comm); - dfloat 
lambda = 0.0; - ellipticSettings.getSetting("LAMBDA", lambda); + dfloat lambda = 0.0; + ellipticSettings.getSetting("LAMBDA", lambda); - // Boundary Type translation. Just defaults. - int NBCTypes = 3; - int BCType[NBCTypes] = {0,1,2}; + // Boundary Type translation. Just defaults. + int NBCTypes = 3; + memory BCType(3); + BCType[0] = 0; + BCType[1] = 1; + BCType[2] = 2; - // set up elliptic solver - elliptic_t& elliptic = elliptic_t::Setup(platform, mesh, ellipticSettings, - lambda, NBCTypes, BCType); + // set up elliptic solver + elliptic_t elliptic(platform, mesh, ellipticSettings, + lambda, NBCTypes, BCType); - // run - elliptic.Run(); + // run + elliptic.Run(); + } // close down MPI - MPI_Finalize(); + Comm::Finalize(); return LIBP_SUCCESS; } diff --git a/solvers/elliptic/ellipticPrecon.hpp b/solvers/elliptic/ellipticPrecon.hpp index 0e735bd4b..65b1ae284 100644 --- a/solvers/elliptic/ellipticPrecon.hpp +++ b/solvers/elliptic/ellipticPrecon.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -31,151 +31,112 @@ SOFTWARE. 
#include "parAlmond.hpp" //Jacobi preconditioner -class JacobiPrecon: public precon_t { +class JacobiPrecon: public operator_t { private: - elliptic_t& elliptic; + elliptic_t elliptic; - occa::memory o_invDiagA; + deviceMemory o_invDiagA; public: + JacobiPrecon() = default; JacobiPrecon(elliptic_t& elliptic); - void Operator(occa::memory& o_r, occa::memory& o_Mr); + void Operator(deviceMemory& o_r, deviceMemory& o_Mr); }; //Inverse Mass Matrix preconditioner -class MassMatrixPrecon: public precon_t { +class MassMatrixPrecon: public operator_t { private: - elliptic_t& elliptic; - mesh_t& mesh; - settings_t& settings; + elliptic_t elliptic; + mesh_t mesh; + settings_t settings; - occa::memory o_MrL, o_rtmp; - occa::memory o_invMM; + deviceMemory o_MrL, o_rtmp; + deviceMemory o_invMM; - occa::kernel blockJacobiKernel; - occa::kernel partialBlockJacobiKernel; + kernel_t blockJacobiKernel; + kernel_t partialBlockJacobiKernel; public: - ~MassMatrixPrecon(); + MassMatrixPrecon() = default; MassMatrixPrecon(elliptic_t& elliptic); - void Operator(occa::memory& o_r, occa::memory& o_Mr); + void Operator(deviceMemory& o_r, deviceMemory& o_Mr); }; //ParAlmond AMG preconditioner -class ParAlmondPrecon: public precon_t { +class ParAlmondPrecon: public operator_t { private: - elliptic_t& elliptic; - settings_t& settings; + elliptic_t elliptic; + settings_t settings; parAlmond::parAlmond_t parAlmond; - dfloat *xG, *rhsG; - occa::memory o_xG, o_rhsG; + memory xG, rhsG; + deviceMemory o_xG, o_rhsG; public: - ~ParAlmondPrecon(); + ParAlmondPrecon() = default; ParAlmondPrecon(elliptic_t& elliptic); - void Operator(occa::memory& o_r, occa::memory& o_Mr); + void Operator(deviceMemory& o_r, deviceMemory& o_Mr); }; // Matrix-free p-Multigrid levels followed by AMG -class MultiGridPrecon: public precon_t { +class MultiGridPrecon: public operator_t { private: - elliptic_t& elliptic; - mesh_t& mesh; - settings_t& settings; + elliptic_t elliptic; + mesh_t mesh; + settings_t settings; 
parAlmond::parAlmond_t parAlmond; public: + MultiGridPrecon() = default; MultiGridPrecon(elliptic_t& elliptic); - ~MultiGridPrecon() = default; - void Operator(occa::memory& o_r, occa::memory& o_Mr); + void Operator(deviceMemory& o_r, deviceMemory& o_Mr); }; // Cast problem into spectrally-equivalent N=1 FEM space and precondition with AMG -class SEMFEMPrecon: public precon_t { +class SEMFEMPrecon: public operator_t { private: - elliptic_t& elliptic; - mesh_t& mesh; - settings_t& settings; + elliptic_t elliptic; + mesh_t mesh; + settings_t settings; - mesh_t *femMesh; - elliptic_t* femElliptic; + mesh_t femMesh; + elliptic_t femElliptic; parAlmond::parAlmond_t parAlmond; - occa::memory o_MrL; + deviceMemory o_MrL; - occa::memory o_zFEM, o_rFEM; - occa::memory o_GzFEM, o_GrFEM; + deviceMemory o_zFEM, o_rFEM; + deviceMemory o_GzFEM, o_GrFEM; - ogs_t *FEMogs; + ogs::ogs_t FEMogs; - occa::kernel SEMFEMInterpKernel; - occa::kernel SEMFEMAnterpKernel; + kernel_t SEMFEMInterpKernel; + kernel_t SEMFEMAnterpKernel; public: - ~SEMFEMPrecon(); + SEMFEMPrecon() = default; SEMFEMPrecon(elliptic_t& elliptic); - void Operator(occa::memory& o_r, occa::memory& o_Mr); + void Operator(deviceMemory& o_r, deviceMemory& o_Mr); }; -class MGLevel; -// Overlapping additive Schwarz with patch problems consisting of the -// entire local mesh + 1 ring overlap, solved with a local multigrid -// precon and coarse problem consisting of the global degree 1 -// problem, solved with parAlmond -class OASPrecon: public precon_t { -private: - elliptic_t& elliptic; - mesh_t& mesh; - settings_t& settings; - - //Patch precon - mesh_t* meshPatch; - elliptic_t* ellipticPatch; - precon_t *preconPatch; - MGLevel *level; - - ogs_t *ogsMaskedRing; //ogs for 1-ring patch - - //Coarse Precon - ogs_t *ogsMasked=nullptr; - parAlmond::parAlmond_t parAlmond; - - dfloat *rPatch, *zPatch; - dfloat *rPatchL, *zPatchL; - occa::memory o_rPatch, o_zPatch; - occa::memory o_rPatchL, o_zPatchL; - - dfloat *rC, *zC; - 
occa::memory o_rC, o_zC; - - dfloat *patchWeight; - occa::memory o_patchWeight; - -public: - ~OASPrecon(); - OASPrecon(elliptic_t& elliptic); - void Operator(occa::memory& o_r, occa::memory& o_Mr); -}; class MGLevel: public parAlmond::multigridLevel { public: - elliptic_t& elliptic; - mesh_t& mesh; - linAlg_t& linAlg; + elliptic_t elliptic; + mesh_t mesh; //prologation - dfloat *P; - occa::memory o_P; + memory P; + deviceMemory o_P; - occa::kernel coarsenKernel, partialCoarsenKernel; - occa::kernel prolongateKernel, partialProlongateKernel; + kernel_t coarsenKernel, partialCoarsenKernel; + kernel_t prolongateKernel, partialProlongateKernel; - //coarse gather op - mesh_t *meshC=nullptr; - ogs_t *ogsMaskedC=nullptr; + //coarse space + elliptic_t ellipticC; + mesh_t meshC; //smoothing params typedef enum {JACOBI=1, @@ -185,35 +146,35 @@ class MGLevel: public parAlmond::multigridLevel { dfloat lambda1, lambda0; int ChebyshevIterations; - static size_t smootherResidualBytes, scratchBytes; - static dfloat *smootherResidual; - static occa::memory o_smootherResidual; - static occa::memory o_smootherResidual2; - static occa::memory o_smootherUpdate; - static occa::memory o_transferScratch; + static dlong NsmootherResidual, Nscratch; + static memory smootherResidual; + static deviceMemory o_smootherResidual; + static deviceMemory o_smootherResidual2; + static deviceMemory o_smootherUpdate; + static deviceMemory o_transferScratch; //jacobi data - occa::memory o_invDiagA; + deviceMemory o_invDiagA; //build a p-multigrid level and connect it to the next one + MGLevel() = default; MGLevel(elliptic_t& _elliptic, dlong _Nrows, dlong _Ncols, int Nc, int NpCoarse); - ~MGLevel(); - void Operator(occa::memory &o_X, occa::memory &o_Ax); + void Operator(deviceMemory &o_X, deviceMemory &o_Ax); - void residual(occa::memory &o_RHS, occa::memory &o_X, occa::memory &o_RES); + void residual(deviceMemory &o_RHS, deviceMemory &o_X, deviceMemory &o_RES); - void coarsen(occa::memory &o_X, 
occa::memory &o_Cx); + void coarsen(deviceMemory &o_X, deviceMemory &o_Cx); - void prolongate(occa::memory &o_X, occa::memory &o_Px); + void prolongate(deviceMemory &o_X, deviceMemory &o_Px); //smoother ops - void smooth(occa::memory &o_RHS, occa::memory &o_X, bool x_is_zero); + void smooth(deviceMemory &o_RHS, deviceMemory &o_X, bool x_is_zero); - void smoothJacobi (occa::memory &o_r, occa::memory &o_X, bool xIsZero); - void smoothChebyshev (occa::memory &o_r, occa::memory &o_X, bool xIsZero); + void smoothJacobi (deviceMemory &o_r, deviceMemory &o_X, bool xIsZero); + void smoothChebyshev (deviceMemory &o_r, deviceMemory &o_X, bool xIsZero); void Report(); @@ -223,5 +184,44 @@ class MGLevel: public parAlmond::multigridLevel { void AllocateStorage(); }; +// Overlapping additive Schwarz with patch problems consisting of the +// entire local mesh + 1 ring overlap, solved with a local multigrid +// precon and coarse problem consisting of the global degree 1 +// problem, solved with parAlmond +class OASPrecon: public operator_t { +private: + elliptic_t elliptic; + mesh_t mesh; + settings_t settings; + + //Patch precon + mesh_t meshPatch; + elliptic_t ellipticPatch; + precon_t preconPatch; + MGLevel level; + + ogs::ogs_t ogsMaskedRing; //ogs for 1-ring patch + + //Coarse Precon + ogs::ogs_t ogsMasked; + parAlmond::parAlmond_t parAlmond; + + memory rPatch, zPatch; + memory rPatchL, zPatchL; + deviceMemory o_rPatch, o_zPatch; + deviceMemory o_rPatchL, o_zPatchL; + + memory rC, zC; + deviceMemory o_rC, o_zC; + + memory patchWeight; + deviceMemory o_patchWeight; + +public: + OASPrecon() = default; + OASPrecon(elliptic_t& elliptic); + void Operator(deviceMemory& o_r, deviceMemory& o_Mr); +}; + -#endif \ No newline at end of file +#endif diff --git a/solvers/elliptic/makefile b/solvers/elliptic/makefile index 18183d2ee..8b9496233 100644 --- a/solvers/elliptic/makefile +++ b/solvers/elliptic/makefile @@ -2,7 +2,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim 
Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal @@ -78,11 +78,8 @@ include ../../make.top endif endif -#gslib -GS_DIR=${LIBP_TPL_DIR}/gslib - #libraries -ELLIPTIC_LIBP_LIBS=parAlmond linearSolver mesh ogs linAlg core +ELLIPTIC_LIBP_LIBS=parAlmond linearSolver mesh parAdogs ogs linAlg core #includes INCLUDES=${LIBP_INCLUDES} \ @@ -92,11 +89,10 @@ DEFINES =${LIBP_DEFINES} \ -DLIBP_DIR='"${LIBP_DIR}"' #.cpp compilation flags -ELLIPTIC_CXXFLAGS=${LIBP_MPICXXFLAGS} ${DEFINES} ${INCLUDES} +ELLIPTIC_CXXFLAGS=${LIBP_CXXFLAGS} ${DEFINES} ${INCLUDES} #link libraries LIBS=-L${LIBP_LIBS_DIR} $(addprefix -l,$(ELLIPTIC_LIBP_LIBS)) \ - -L$(GS_DIR)/lib -lgs \ ${LIBP_LIBS} #link flags @@ -144,10 +140,10 @@ endif # rule for .cpp files %.o: %.cpp $(DEPS) | libp_libs ifneq (,${verbose}) - $(LIBP_MPICXX) -o $*.o -c $*.cpp $(ELLIPTIC_CXXFLAGS) + $(LIBP_CXX) -o $*.o -c $*.cpp $(ELLIPTIC_CXXFLAGS) else @printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n"; - @$(LIBP_MPICXX) -o $*.o -c $*.cpp $(ELLIPTIC_CXXFLAGS) + @$(LIBP_CXX) -o $*.o -c $*.cpp $(ELLIPTIC_CXXFLAGS) endif #cleanup @@ -158,8 +154,7 @@ clean-libs: clean ${MAKE} -C ${LIBP_LIBS_DIR} clean clean-kernels: clean-libs -# $(shell ${OCCA_DIR}/bin/occa clear all -y) - rm -rf ~/.occa/ + rm -rf ${LIBP_DIR}/.occa/ realclean: clean ${MAKE} -C ${LIBP_LIBS_DIR} realclean diff --git a/solvers/elliptic/okl/ellipticAddBCHex3D.okl b/solvers/elliptic/okl/ellipticAddBCHex3D.okl index 085e6e35b..634492c0e 100644 --- a/solvers/elliptic/okl/ellipticAddBCHex3D.okl +++ b/solvers/elliptic/okl/ellipticAddBCHex3D.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission 
is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/elliptic/okl/ellipticAddBCQuad2D.okl b/solvers/elliptic/okl/ellipticAddBCQuad2D.okl index 5f0db75fd..17025be10 100644 --- a/solvers/elliptic/okl/ellipticAddBCQuad2D.okl +++ b/solvers/elliptic/okl/ellipticAddBCQuad2D.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -46,4 +46,4 @@ SOFTWARE. } } } -} \ No newline at end of file +} diff --git a/solvers/elliptic/okl/ellipticAddBCQuad3D.okl b/solvers/elliptic/okl/ellipticAddBCQuad3D.okl index f62e51948..8236ce41c 100644 --- a/solvers/elliptic/okl/ellipticAddBCQuad3D.okl +++ b/solvers/elliptic/okl/ellipticAddBCQuad3D.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/elliptic/okl/ellipticAddBCTet3D.okl b/solvers/elliptic/okl/ellipticAddBCTet3D.okl index 7a29b10fa..4d93151f4 100644 --- a/solvers/elliptic/okl/ellipticAddBCTet3D.okl +++ b/solvers/elliptic/okl/ellipticAddBCTet3D.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git 
a/solvers/elliptic/okl/ellipticAddBCTri2D.okl b/solvers/elliptic/okl/ellipticAddBCTri2D.okl index 2f905a125..b5f8e124a 100644 --- a/solvers/elliptic/okl/ellipticAddBCTri2D.okl +++ b/solvers/elliptic/okl/ellipticAddBCTri2D.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/elliptic/okl/ellipticAxHex3D.okl b/solvers/elliptic/okl/ellipticAxHex3D.okl index bfb2b7ca7..7a3ea1ce0 100644 --- a/solvers/elliptic/okl/ellipticAxHex3D.okl +++ b/solvers/elliptic/okl/ellipticAxHex3D.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,6 +26,7 @@ SOFTWARE. @kernel void ellipticAxHex3D(const dlong Nelements, + @restrict const dfloat * wJ, @restrict const dfloat * ggeo, @restrict const dfloat * DT, @restrict const dfloat * S, @@ -81,12 +82,10 @@ SOFTWARE. r_G12 = ggeo[gbase+p_G12ID*p_Np]; r_G22 = ggeo[gbase+p_G22ID*p_Np]; - r_GwJ = ggeo[gbase+p_GWJID*p_Np]; + r_GwJ = wJ[e*p_Np + k*p_Nq*p_Nq + j*p_Nq + i]; } } - @barrier("local"); - for(int j=0;j - @barrier("local"); squareThreads{ s_q[j][i] = r_G00*r_qr + r_G01*r_qs; } - @barrier("local"); squareThreads{ dfloat tmp = 0.f; @@ -105,13 +103,11 @@ SOFTWARE. } // s term ----> - @barrier("local"); squareThreads{ s_q[j][i] = r_G01*r_qr + r_G11*r_qs; } - @barrier("local"); squareThreads{ dfloat tmp = 0.f; @@ -133,6 +129,7 @@ SOFTWARE. 
@kernel void ellipticPartialAxQuad2D(const dlong Nelements, @restrict const dlong * elementList, @restrict const dlong * GlobalToLocal, + @restrict const dfloat * wJ, @restrict const dfloat * ggeo, @restrict const dfloat * DT, @restrict const dfloat * S, @@ -161,14 +158,13 @@ SOFTWARE. s_DT[j][i] = DT[j*p_Nq+i]; } - @barrier("local"); squareThreads{ const dlong base = element*p_Nggeo*p_Np + j*p_Nq + i; // assumes w*J built into G entries - r_GwJ = ggeo[base+p_GWJID*p_Np]; + r_GwJ = wJ[element*p_Np + j*p_Nq + i]; r_G00 = ggeo[base+p_G00ID*p_Np]; r_G01 = ggeo[base+p_G01ID*p_Np]; @@ -189,13 +185,11 @@ SOFTWARE. } // r term -----> - @barrier("local"); squareThreads{ s_q[j][i] = r_G00*r_qr + r_G01*r_qs; } - @barrier("local"); squareThreads{ dfloat tmp = 0.f; @@ -208,13 +202,11 @@ SOFTWARE. } // s term ----> - @barrier("local"); squareThreads{ s_q[j][i] = r_G01*r_qr + r_G11*r_qs; } - @barrier("local"); squareThreads{ dfloat tmp = 0.f; diff --git a/solvers/elliptic/okl/ellipticAxQuad3D.okl b/solvers/elliptic/okl/ellipticAxQuad3D.okl index 8c1323a4e..d953f3a36 100644 --- a/solvers/elliptic/okl/ellipticAxQuad3D.okl +++ b/solvers/elliptic/okl/ellipticAxQuad3D.okl @@ -2,7 +2,7 @@ The MIT License (MIT) - Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,19 +27,20 @@ // hex @kernel for screened coulomb potential mat-vec #define squareThreads \ - for(int j=0; j - @barrier("local"); squareThreads{ // s_q[j][i] = r_G00*r_qr + r_G01*r_qs + r_G02*r_q; s_q[j][i] = r_G00*r_qr + r_G01*r_qs; } - @barrier("local"); squareThreads{ dfloat tmp = 0.f; #pragma unroll p_Nq for(int n=0;n - @barrier("local"); squareThreads{ // s_q[j][i] = r_G01*r_qr + r_G11*r_qs + 0.f*r_G12*r_q; s_q[j][i] = r_G01*r_qr + r_G11*r_qs; } - 
@barrier("local"); squareThreads{ // dfloat tmp = r_G22*r_q; @@ -131,7 +127,7 @@ #pragma unroll p_Nq for(int n=0;n - @barrier("local"); squareThreads{ // s_q[j][i] = r_G00*r_qr + r_G01*r_qs + r_G02*r_q; s_q[j][i] = r_G00*r_qr + r_G01*r_qs; } - @barrier("local"); squareThreads{ dfloat tmp = 0.f; #pragma unroll p_Nq for(int n=0;n - @barrier("local"); squareThreads{ // s_q[j][i] = r_G01*r_qr + r_G11*r_qs + r_G12*r_q; s_q[j][i] = r_G01*r_qr + r_G11*r_qs; } - @barrier("local"); squareThreads{ // dfloat tmp = r_G22*r_q; @@ -244,7 +236,7 @@ #pragma unroll p_Nq for(int n=0;n c -> a - @barrier("local"); - + // test in b for(int k=0;k - @barrier("local"); for(int j=0;j - @barrier("local"); for(int j=0;j - @barrier("local"); for(int j=0;j - @barrier("local"); for(int j=0;j void elliptic_t::BoundarySetup(){ //check all the bounaries for a Dirichlet - int localAllNeumann = (lambda==0) ? 1 : 0; //if lambda>0 we don't care about all Neumann problem + allNeumann = (lambda==0) ? 1 : 0; //if lambda>0 we don't care about all Neumann problem allNeumannPenalty = 1.; - //setup normalization constant - if (settings.compareSetting("DISCRETIZATION","IPDG")) { - allNeumannScale = 1./sqrt((dfloat)mesh.Np*mesh.NelementsGlobal); - } else { - //note that we can use the mesh ogs, since there are no masked nodes - allNeumannScale = 1./sqrt((dfloat)mesh.ogs->NgatherGlobal); - } - - //setup a custom element-to-boundaryflag mapping - EToB = (int *) calloc(mesh.Nelements*mesh.Nfaces,sizeof(int)); + //translate the mesh's element-to-boundaryflag mapping + EToB.malloc(mesh.Nelements*mesh.Nfaces, 0); for (dlong e=0;e0) { - int BC = BCType[bc]; //translate mesh's boundary flag - EToB[e*mesh.Nfaces+f] = BC; //record it - if (BC!=2) localAllNeumann = 0; //check if its a Dirchlet + int BC = BCType[bc]; //translate mesh's boundary flag + EToB[e*mesh.Nfaces+f] = BC; //record it + if (BC!=2) allNeumann = 0; //check if its a Dirchlet } } } - o_EToB = platform.malloc(mesh.Nelements*mesh.Nfaces*sizeof(int), 
EToB); + o_EToB = platform.malloc(EToB); //collect the allNeumann flags from other ranks - MPI_Allreduce(&localAllNeumann, &allNeumann, 1, MPI_INT, MPI_MIN, mesh.comm); - + mesh.comm.Allreduce(allNeumann, Comm::Min); - //make a node-wise bc flag using the gsop (prioritize Dirichlet boundaries over Neumann) - mapB = (int *) calloc(mesh.Nelements*mesh.Np,sizeof(int)); - const int largeNumber = 1<<20; - for (dlong e=0;e0) { - for (int n=0;nGatherScatter(mapB, ogs_int, ogs_min, ogs_sym); - - //use the bc flags to find masked ids + //translate the mesh's node-wise bc flag Nmasked = 0; - for (dlong n=0;n0) { + int BC = BCType[bc]; //translate mesh's boundary flag + mapB[n] = BC; //record it + + if (mapB[n] == 1) Nmasked++; //Dirichlet boundary } } - o_mapB = platform.malloc(mesh.Nelements*mesh.Np*sizeof(int), mapB); + o_mapB = platform.malloc(mapB); - - maskIds = (dlong *) calloc(Nmasked, sizeof(dlong)); + maskIds.malloc(Nmasked); Nmasked =0; //reset - for (dlong n=0;n(maskIds); //make a masked version of the global id numbering - maskedGlobalIds = (hlong *) calloc(mesh.Nelements*mesh.Np,sizeof(hlong)); - memcpy(maskedGlobalIds, mesh.globalIds, mesh.Nelements*mesh.Np*sizeof(hlong)); - for (dlong n=0;nNgather; // number of degrees of freedom on this rank (after gathering) + hlong Ngather = ogsMasked.Ngather; // number of degrees of freedom on this rank (after gathering) // build inverse degree vectors // used for the weight in linear solvers (used in C0) - weight = (dfloat*) calloc(Ntotal, sizeof(dfloat)); - weightG = (dfloat*) calloc(ogsMasked->Ngather, sizeof(dfloat)); - for(dlong n=0;nGather(weightG, weight, ogs_dfloat, ogs_add, ogs_trans); - for(dlong n=0;nNgather;++n) - if (weightG[n]) weightG[n] = 1./weightG[n]; + weightG.malloc(Ngather); + ogsMasked.Gather(weightG, weight, 1, ogs::Add, ogs::Trans); - ogsMasked->Scatter(weight, weightG, ogs_dfloat, ogs_add, ogs_notrans); + for(dlong n=0;n0.0) weightG[n] = 1./weightG[n]; + } + + ogsMasked.Scatter(weight, weightG, 1, 
ogs::NoTrans); - o_weight = platform.malloc(Ntotal*sizeof(dfloat), weight); - o_weightG = platform.malloc(ogsMasked->Ngather*sizeof(dfloat), weightG); + o_weight = platform.malloc(weight); + o_weightG = platform.malloc(weightG); // create a global numbering system - hlong *globalIds = (hlong *) calloc(Ngather,sizeof(hlong)); + memory globalIds(Ngather); // every gathered degree of freedom has its own global id - hlong *globalStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong)); - MPI_Allgather(&Ngather, 1, MPI_HLONG, globalStarts+1, 1, MPI_HLONG, mesh.comm); - for(int r=0;r(Ngather); + comm.Scan(Ngather, globalOffset); + globalOffset = globalOffset-Ngather; //use the offsets to set a consecutive global numbering - for (dlong n =0;nNgather;n++) { - globalIds[n] = n + globalStarts[mesh.rank]; + for (dlong n =0;nScatter(maskedGlobalNumbering, globalIds, ogs_hlong, ogs_add, ogs_notrans); - free(globalIds); + maskedGlobalNumbering.malloc(Ntotal, -1); + ogsMasked.Scatter(maskedGlobalNumbering, globalIds, 1, ogs::NoTrans); /* Build halo exchange for gathered ordering */ - ogsMasked->GatheredHaloExchangeSetup(); + gHalo.SetupFromGather(ogsMasked); + + GlobalToLocal.malloc(mesh.Nelements*mesh.Np); + ogsMasked.SetupGlobalToLocalMapping(GlobalToLocal); + + o_GlobalToLocal = platform.malloc(GlobalToLocal); } diff --git a/solvers/elliptic/src/ellipticBuildOperatorDiagonal.cpp b/solvers/elliptic/src/ellipticBuildOperatorDiagonal.cpp index 23669611c..a984b8546 100644 --- a/solvers/elliptic/src/ellipticBuildOperatorDiagonal.cpp +++ b/solvers/elliptic/src/ellipticBuildOperatorDiagonal.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,16 +25,14 @@ SOFTWARE. 
*/ #include "elliptic.hpp" -#include "mesh/meshDefines2D.h" -#include "mesh/meshDefines3D.h" -void elliptic_t::BuildOperatorDiagonal(dfloat *diagA){ +void elliptic_t::BuildOperatorDiagonal(memory& diagA){ - if(mesh.rank==0) {printf("Building diagonal...");fflush(stdout);} + if(Comm::World().rank()==0) {printf("Building diagonal...");fflush(stdout);} if (settings.compareSetting("DISCRETIZATION","IPDG")) { switch(mesh.elementType){ - case TRIANGLES: + case Mesh::TRIANGLES: { if(mesh.dim==2) BuildOperatorDiagonalIpdgTri2D(diagA); @@ -42,24 +40,24 @@ void elliptic_t::BuildOperatorDiagonal(dfloat *diagA){ BuildOperatorDiagonalIpdgTri3D(diagA); break; } - case QUADRILATERALS: + case Mesh::QUADRILATERALS: BuildOperatorDiagonalIpdgQuad2D(diagA); break; - case TETRAHEDRA: + case Mesh::TETRAHEDRA: BuildOperatorDiagonalIpdgTet3D(diagA); break; - case HEXAHEDRA: + case Mesh::HEXAHEDRA: BuildOperatorDiagonalIpdgHex3D(diagA); break; } } else if (settings.compareSetting("DISCRETIZATION","CONTINUOUS")) { - dfloat* diagAL = (dfloat*) malloc(mesh.Np*mesh.Nelements*sizeof(dfloat)); + memory diagAL(mesh.Np*mesh.Nelements); switch(mesh.elementType){ - case TRIANGLES: + case Mesh::TRIANGLES: BuildOperatorDiagonalContinuousTri2D(diagAL); break; - case QUADRILATERALS: + case Mesh::QUADRILATERALS: { if(mesh.dim==2) BuildOperatorDiagonalContinuousQuad2D(diagAL); @@ -67,25 +65,24 @@ void elliptic_t::BuildOperatorDiagonal(dfloat *diagA){ BuildOperatorDiagonalContinuousQuad3D(diagAL); break; } - case TETRAHEDRA: + case Mesh::TETRAHEDRA: BuildOperatorDiagonalContinuousTet3D(diagAL); break; - case HEXAHEDRA: + case Mesh::HEXAHEDRA: BuildOperatorDiagonalContinuousHex3D(diagAL); break; } //gather the diagonal to assemble it - ogsMasked->Gather(diagA, diagAL, ogs_dfloat, ogs_add, ogs_trans); - free(diagAL); + ogsMasked.Gather(diagA, diagAL, 1, ogs::Add, ogs::Trans); } - if(mesh.rank==0) printf("done.\n"); + if(Comm::World().rank()==0) printf("done.\n"); } -void 
elliptic_t::BuildOperatorDiagonalIpdgTri2D(dfloat *A) { +void elliptic_t::BuildOperatorDiagonalIpdgTri2D(memory& A) { // surface mass matrices MS = MM*LIFT - dfloat *MS = (dfloat *) calloc(mesh.Nfaces*mesh.Nfp*mesh.Nfp,sizeof(dfloat)); + memory MS(mesh.Nfaces*mesh.Nfp*mesh.Nfp); for (int f=0;f MSf = MS+fM*mesh.Nfp*mesh.Nfp; // penalty term just involves face nodes for(int n=0;n& A) { // surface mass matrices MS = MM*LIFT - dfloat *MS = (dfloat *) calloc(mesh.Nfaces*mesh.Nfp*mesh.Nfp,sizeof(dfloat)); + memory MS(mesh.Nfaces*mesh.Nfp*mesh.Nfp); for (int f=0;f MSf = MS+fM*mesh.Nfp*mesh.Nfp; // penalty term just involves face nodes for(int n=0;n& A) { for(dlong eM=0;eM& A) { // build some monolithic basis arrays (for quads and hexes) - dfloat *B = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat)); - dfloat *Br = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat)); - dfloat *Bs = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat)); + memory B (mesh.Np*mesh.Np, 0.0); + memory Br(mesh.Np*mesh.Np, 0.0); + memory Bs(mesh.Np*mesh.Np, 0.0); int mode = 0; for(int nj=0;nj& A) { for(dlong eM=0;eM& A) { // build some monolithic basis arrays (for quads and hexes) - dfloat *B = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat)); - dfloat *Br = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat)); - dfloat *Bs = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat)); + memory B (mesh.Np*mesh.Np, 0.0); + memory Br(mesh.Np*mesh.Np, 0.0); + memory Bs(mesh.Np*mesh.Np, 0.0); int mode = 0; for(int nj=0;nj& A) { for(dlong eM=0;eM& A) { // surface mass matrices MS = MM*LIFT - dfloat *MS = (dfloat *) calloc(mesh.Nfaces*mesh.Nfp*mesh.Nfp,sizeof(dfloat)); + memory MS(mesh.Nfaces*mesh.Nfp*mesh.Nfp); for (int f=0;f MSf = MS+fM*mesh.Nfp*mesh.Nfp; // penalty term just involves face nodes for(int n=0;n& A) { for(dlong eM=0;eM& A) { // build some monolithic basis arrays (for quads and hexes) - dfloat *B = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat)); - dfloat *Br = (dfloat*) calloc(mesh.Np*mesh.Np, 
sizeof(dfloat)); - dfloat *Bs = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat)); - dfloat *Bt = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat)); + memory B (mesh.Np*mesh.Np, 0.0); + memory Br(mesh.Np*mesh.Np, 0.0); + memory Bs(mesh.Np*mesh.Np, 0.0); + memory Bt(mesh.Np*mesh.Np, 0.0); int mode = 0; for(int nk=0;nk& A) { for(dlong eM=0;eM +using __gnu_parallel::sort; +#else +using std::sort; +#endif void elliptic_t::BuildOperatorMatrixContinuous(parAlmond::parCOO& A) { switch(mesh.elementType){ - case TRIANGLES: + case Mesh::TRIANGLES: BuildOperatorMatrixContinuousTri2D(A); break; - case QUADRILATERALS: + case Mesh::QUADRILATERALS: { if(mesh.dim==2) BuildOperatorMatrixContinuousQuad2D(A); @@ -42,9 +47,9 @@ void elliptic_t::BuildOperatorMatrixContinuous(parAlmond::parCOO& A) { break; } - case TETRAHEDRA: + case Mesh::TETRAHEDRA: BuildOperatorMatrixContinuousTet3D(A); break; - case HEXAHEDRA: + case Mesh::HEXAHEDRA: BuildOperatorMatrixContinuousHex3D(A); break; } } @@ -52,12 +57,12 @@ void elliptic_t::BuildOperatorMatrixContinuous(parAlmond::parCOO& A) { void elliptic_t::BuildOperatorMatrixContinuousTri2D(parAlmond::parCOO& A) { // number of degrees of freedom on this rank (after gathering) - hlong Ngather = ogsMasked->Ngather; + hlong Ngather = ogsMasked.Ngather; // every gathered degree of freedom has its own global id - A.globalRowStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong)); - A.globalColStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong)); - MPI_Allgather(&Ngather, 1, MPI_HLONG, A.globalRowStarts+1, 1, MPI_HLONG, mesh.comm); + A.globalRowStarts.malloc(mesh.size+1, 0); + A.globalColStarts.malloc(mesh.size+1, 0); + mesh.comm.Allgather(Ngather, A.globalRowStarts+1); for(int r=0;r sendNonZeros(nnzLocal); + memory AsendCounts (mesh.size, 0); + memory ArecvCounts (mesh.size); + memory AsendOffsets(mesh.size+1); + memory ArecvOffsets(mesh.size+1); - dfloat *Srr = mesh.Srr; - dfloat *Srs = mesh.Srs; - dfloat *Sss = mesh.Sss; - dfloat *MM = mesh.MM ; + memory Srr 
= mesh.Srr; + memory Srs = mesh.Srs; + memory Sss = mesh.Sss; + memory MM = mesh.MM ; - if(mesh.rank==0) {printf("Building full FEM matrix...");fflush(stdout);} + if(Comm::World().rank()==0) {printf("Building full FEM matrix...");fflush(stdout);} //Build unassembed non-zeros dlong cnt =0; for (dlong e=0;e b.row) return false; + sort(sendNonZeros.ptr(), sendNonZeros.ptr()+cnt, + [](const parAlmond::parCOO::nonZero_t& a, + const parAlmond::parCOO::nonZero_t& b) { + if (a.row < b.row) return true; + if (a.row > b.row) return false; - return a.col < b.col; - }); + return a.col < b.col; + }); // count how many non-zeros to send to each process int rr=0; @@ -130,32 +135,33 @@ void elliptic_t::BuildOperatorMatrixContinuousTri2D(parAlmond::parCOO& A) { } // find how many nodes to expect (should use sparse version) - MPI_Alltoall(AsendCounts, 1, MPI_INT, ArecvCounts, 1, MPI_INT, mesh.comm); + mesh.comm.Alltoall(AsendCounts, ArecvCounts); // find send and recv offsets for gather A.nnz = 0; + AsendOffsets[0] = 0; + ArecvOffsets[0] = 0; for(int r=0;r b.row) return false; + sort(A.entries.ptr(), A.entries.ptr()+A.nnz, + [](const parAlmond::parCOO::nonZero_t& a, + const parAlmond::parCOO::nonZero_t& b) { + if (a.row < b.row) return true; + if (a.row > b.row) return false; - return a.col < b.col; - }); + return a.col < b.col; + }); // compress duplicates cnt = 0; @@ -172,26 +178,19 @@ void elliptic_t::BuildOperatorMatrixContinuousTri2D(parAlmond::parCOO& A) { if (A.nnz) cnt++; A.nnz = cnt; - if(mesh.rank==0) printf("done.\n"); - - MPI_Barrier(mesh.comm); - free(sendNonZeros); - free(AsendCounts); - free(ArecvCounts); - free(AsendOffsets); - free(ArecvOffsets); + if(Comm::World().rank()==0) printf("done.\n"); } void elliptic_t::BuildOperatorMatrixContinuousQuad3D(parAlmond::parCOO& A) { // number of degrees of freedom on this rank (after gathering) - hlong Ngather = ogsMasked->Ngather; + hlong Ngather = ogsMasked.Ngather; // every gathered degree of freedom has its own global id - 
A.globalRowStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong)); - A.globalColStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong)); - MPI_Allgather(&Ngather, 1, MPI_HLONG, A.globalRowStarts+1, 1, MPI_HLONG, mesh.comm); + A.globalRowStarts.malloc(mesh.size+1, 0); + A.globalColStarts.malloc(mesh.size+1, 0); + mesh.comm.Allgather(Ngather, A.globalRowStarts+1); for(int r=0;r sendNonZeros(nnzLocal); + memory AsendCounts (mesh.size, 0); + memory ArecvCounts (mesh.size); + memory AsendOffsets(mesh.size+1); + memory ArecvOffsets(mesh.size+1); - if(mesh.rank==0) {printf("Building full FEM matrix...");fflush(stdout);} + if(Comm::World().rank()==0) {printf("Building full FEM matrix...");fflush(stdout);} #if 0 hlong NTf = mesh.Nelements*mesh.Np * mesh.Nelements*mesh.Np ; @@ -228,34 +227,34 @@ void elliptic_t::BuildOperatorMatrixContinuousQuad3D(parAlmond::parCOO& A) { if (ny==my) { for (int k=0;k b.row) return false; + sort(sendNonZeros.ptr(), sendNonZeros.ptr()+cnt, + [](const parAlmond::parCOO::nonZero_t& a, + const parAlmond::parCOO::nonZero_t& b) { + if (a.row < b.row) return true; + if (a.row > b.row) return false; - return a.col < b.col; - }); + return a.col < b.col; + }); // count how many non-zeros to send to each process int rr=0; @@ -339,32 +338,33 @@ void elliptic_t::BuildOperatorMatrixContinuousQuad3D(parAlmond::parCOO& A) { } // find how many nodes to expect (should use sparse version) - MPI_Alltoall(AsendCounts, 1, MPI_INT, ArecvCounts, 1, MPI_INT, mesh.comm); + mesh.comm.Alltoall(AsendCounts, ArecvCounts); // find send and recv offsets for gather A.nnz = 0; + AsendOffsets[0] = 0; + ArecvOffsets[0] = 0; for(int r=0;r b.row) return false; + sort(A.entries.ptr(), A.entries.ptr()+A.nnz, + [](const parAlmond::parCOO::nonZero_t& a, + const parAlmond::parCOO::nonZero_t& b) { + if (a.row < b.row) return true; + if (a.row > b.row) return false; - return a.col < b.col; - }); + return a.col < b.col; + }); // compress duplicates cnt = 0; @@ -395,26 +395,19 @@ void 
elliptic_t::BuildOperatorMatrixContinuousQuad3D(parAlmond::parCOO& A) { fclose(fp); #endif - if(mesh.rank==0) printf("done.\n"); - - MPI_Barrier(mesh.comm); - free(sendNonZeros); - free(AsendCounts); - free(ArecvCounts); - free(AsendOffsets); - free(ArecvOffsets); + if(Comm::World().rank()==0) printf("done.\n"); } void elliptic_t::BuildOperatorMatrixContinuousQuad2D(parAlmond::parCOO& A) { // number of degrees of freedom on this rank (after gathering) - hlong Ngather = ogsMasked->Ngather; + hlong Ngather = ogsMasked.Ngather; // every gathered degree of freedom has its own global id - A.globalRowStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong)); - A.globalColStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong)); - MPI_Allgather(&Ngather, 1, MPI_HLONG, A.globalRowStarts+1, 1, MPI_HLONG, mesh.comm); + A.globalRowStarts.malloc(mesh.size+1,0); + A.globalColStarts.malloc(mesh.size+1,0); + mesh.comm.Allgather(Ngather, A.globalRowStarts+1); for(int r=0;r sendNonZeros(nnzLocal); + memory AsendCounts (mesh.size, 0); + memory ArecvCounts (mesh.size); + memory AsendOffsets(mesh.size+1); + memory ArecvOffsets(mesh.size+1); - if(mesh.rank==0) {printf("Building full FEM matrix...");fflush(stdout);} + if(Comm::World().rank()==0) {printf("Building full FEM matrix...");fflush(stdout);} //Build unassembed non-zeros dlong cnt =0; @@ -446,25 +439,25 @@ void elliptic_t::BuildOperatorMatrixContinuousQuad2D(parAlmond::parCOO& A) { if (ny==my) { for (int k=0;k b.row) return false; + sort(sendNonZeros.ptr(), sendNonZeros.ptr()+cnt, + [](const parAlmond::parCOO::nonZero_t& a, + const parAlmond::parCOO::nonZero_t& b) { + if (a.row < b.row) return true; + if (a.row > b.row) return false; - return a.col < b.col; - }); + return a.col < b.col; + }); // count how many non-zeros to send to each process int rr=0; @@ -509,32 +502,33 @@ void elliptic_t::BuildOperatorMatrixContinuousQuad2D(parAlmond::parCOO& A) { } // find how many nodes to expect (should use sparse version) - MPI_Alltoall(AsendCounts, 
1, MPI_INT, ArecvCounts, 1, MPI_INT, mesh.comm); + mesh.comm.Alltoall(AsendCounts, ArecvCounts); // find send and recv offsets for gather A.nnz = 0; + AsendOffsets[0] = 0; + ArecvOffsets[0] = 0; for(int r=0;r b.row) return false; + sort(A.entries.ptr(), A.entries.ptr()+A.nnz, + [](const parAlmond::parCOO::nonZero_t& a, + const parAlmond::parCOO::nonZero_t& b) { + if (a.row < b.row) return true; + if (a.row > b.row) return false; - return a.col < b.col; - }); + return a.col < b.col; + }); // compress duplicates cnt = 0; @@ -565,25 +559,18 @@ void elliptic_t::BuildOperatorMatrixContinuousQuad2D(parAlmond::parCOO& A) { fclose(fp); #endif - if(mesh.rank==0) printf("done.\n"); - - MPI_Barrier(mesh.comm); - free(sendNonZeros); - free(AsendCounts); - free(ArecvCounts); - free(AsendOffsets); - free(ArecvOffsets); + if(Comm::World().rank()==0) printf("done.\n"); } void elliptic_t::BuildOperatorMatrixContinuousTet3D(parAlmond::parCOO& A) { // number of degrees of freedom on this rank (after gathering) - hlong Ngather = ogsMasked->Ngather; + hlong Ngather = ogsMasked.Ngather; // every gathered degree of freedom has its own global id - A.globalRowStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong)); - A.globalColStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong)); - MPI_Allgather(&Ngather, 1, MPI_HLONG, A.globalRowStarts+1, 1, MPI_HLONG, mesh.comm); + A.globalRowStarts.malloc(mesh.size+1,0); + A.globalColStarts.malloc(mesh.size+1,0); + mesh.comm.Allgather(Ngather, A.globalRowStarts+1); for(int r=0;r sendNonZeros(nnzLocal); + memory AsendCounts (mesh.size, 0); + memory ArecvCounts (mesh.size); + memory AsendOffsets(mesh.size+1); + memory ArecvOffsets(mesh.size+1); //Build unassembed non-zeros - if(mesh.rank==0) {printf("Building full FEM matrix...");fflush(stdout);} + if(Comm::World().rank()==0) {printf("Building full FEM matrix...");fflush(stdout);} dlong cnt =0; //#pragma omp parallel for for (dlong e=0;e b.row) return false; + sort(sendNonZeros.ptr(), sendNonZeros.ptr()+cnt, 
+ [](const parAlmond::parCOO::nonZero_t& a, + const parAlmond::parCOO::nonZero_t& b) { + if (a.row < b.row) return true; + if (a.row > b.row) return false; - return a.col < b.col; - }); + return a.col < b.col; + }); // count how many non-zeros to send to each process int rr=0; @@ -661,32 +648,33 @@ void elliptic_t::BuildOperatorMatrixContinuousTet3D(parAlmond::parCOO& A) { } // find how many nodes to expect (should use sparse version) - MPI_Alltoall(AsendCounts, 1, MPI_INT, ArecvCounts, 1, MPI_INT, mesh.comm); + mesh.comm.Alltoall(AsendCounts, ArecvCounts); // find send and recv offsets for gather A.nnz = 0; + AsendOffsets[0] = 0; + ArecvOffsets[0] = 0; for(int r=0;r b.row) return false; + sort(A.entries.ptr(), A.entries.ptr()+A.nnz, + [](const parAlmond::parCOO::nonZero_t& a, + const parAlmond::parCOO::nonZero_t& b) { + if (a.row < b.row) return true; + if (a.row > b.row) return false; - return a.col < b.col; - }); + return a.col < b.col; + }); // compress duplicates cnt = 0; @@ -703,25 +691,18 @@ void elliptic_t::BuildOperatorMatrixContinuousTet3D(parAlmond::parCOO& A) { if (A.nnz) cnt++; A.nnz = cnt; - if(mesh.rank==0) printf("done.\n"); - - MPI_Barrier(mesh.comm); - free(sendNonZeros); - free(AsendCounts); - free(ArecvCounts); - free(AsendOffsets); - free(ArecvOffsets); + if(Comm::World().rank()==0) printf("done.\n"); } void elliptic_t::BuildOperatorMatrixContinuousHex3D(parAlmond::parCOO& A) { // number of degrees of freedom on this rank (after gathering) - hlong Ngather = ogsMasked->Ngather; + hlong Ngather = ogsMasked.Ngather; // every gathered degree of freedom has its own global id - A.globalRowStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong)); - A.globalColStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong)); - MPI_Allgather(&Ngather, 1, MPI_HLONG, A.globalRowStarts+1, 1, MPI_HLONG, mesh.comm); + A.globalRowStarts.malloc(mesh.size+1,0); + A.globalColStarts.malloc(mesh.size+1,0); + mesh.comm.Allgather(Ngather, A.globalRowStarts+1); for(int r=0;r 
sendNonZeros(nnzLocal); + memory AsendCounts (mesh.size, 0); + memory ArecvCounts (mesh.size); + memory AsendOffsets(mesh.size+1); + memory ArecvOffsets(mesh.size+1); - if(mesh.rank==0) {printf("Building full FEM matrix...");fflush(stdout);} + if(Comm::World().rank()==0) {printf("Building full FEM matrix...");fflush(stdout);} dlong cnt =0; for (dlong e=0;e b.row) return false; + sort(sendNonZeros.ptr(), sendNonZeros.ptr()+cnt, + [](const parAlmond::parCOO::nonZero_t& a, + const parAlmond::parCOO::nonZero_t& b) { + if (a.row < b.row) return true; + if (a.row > b.row) return false; - return a.col < b.col; - }); + return a.col < b.col; + }); // count how many non-zeros to send to each process int rr=0; @@ -852,32 +833,33 @@ void elliptic_t::BuildOperatorMatrixContinuousHex3D(parAlmond::parCOO& A) { } // find how many nodes to expect (should use sparse version) - MPI_Alltoall(AsendCounts, 1, MPI_INT, ArecvCounts, 1, MPI_INT, mesh.comm); + mesh.comm.Alltoall(AsendCounts, ArecvCounts); // find send and recv offsets for gather A.nnz = 0; + AsendOffsets[0] = 0; + ArecvOffsets[0] = 0; for(int r=0;r b.row) return false; + sort(A.entries.ptr(), A.entries.ptr()+A.nnz, + [](const parAlmond::parCOO::nonZero_t& a, + const parAlmond::parCOO::nonZero_t& b) { + if (a.row < b.row) return true; + if (a.row > b.row) return false; - return a.col < b.col; - }); + return a.col < b.col; + }); // compress duplicates cnt = 0; @@ -894,12 +876,5 @@ void elliptic_t::BuildOperatorMatrixContinuousHex3D(parAlmond::parCOO& A) { if (A.nnz) cnt++; A.nnz = cnt; - if(mesh.rank==0) printf("done.\n"); - - MPI_Barrier(mesh.comm); - free(sendNonZeros); - free(AsendCounts); - free(ArecvCounts); - free(AsendOffsets); - free(ArecvOffsets); + if(Comm::World().rank()==0) printf("done.\n"); } diff --git a/solvers/elliptic/src/ellipticBuildOperatorMatrixIpdg.cpp b/solvers/elliptic/src/ellipticBuildOperatorMatrixIpdg.cpp index 5dc01bf1d..c4abf59a9 100644 --- 
a/solvers/elliptic/src/ellipticBuildOperatorMatrixIpdg.cpp +++ b/solvers/elliptic/src/ellipticBuildOperatorMatrixIpdg.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,13 +25,18 @@ SOFTWARE. */ #include "elliptic.hpp" -#include "mesh/meshDefines2D.h" -#include "mesh/meshDefines3D.h" + +#ifdef GLIBCXX_PARALLEL +#include +using __gnu_parallel::sort; +#else +using std::sort; +#endif void elliptic_t::BuildOperatorMatrixIpdg(parAlmond::parCOO& A){ switch(mesh.elementType){ - case TRIANGLES: + case Mesh::TRIANGLES: { if(mesh.dim==2) BuildOperatorMatrixIpdgTri2D(A); @@ -39,25 +44,22 @@ void elliptic_t::BuildOperatorMatrixIpdg(parAlmond::parCOO& A){ BuildOperatorMatrixIpdgTri3D(A); break; } - case QUADRILATERALS:{ + case Mesh::QUADRILATERALS:{ if(mesh.dim==2) BuildOperatorMatrixIpdgQuad2D(A); else BuildOperatorMatrixIpdgQuad3D(A); break; } - case TETRAHEDRA: + case Mesh::TETRAHEDRA: BuildOperatorMatrixIpdgTet3D(A); break; - case HEXAHEDRA: + case Mesh::HEXAHEDRA: BuildOperatorMatrixIpdgHex3D(A); break; } - } void elliptic_t::BuildOperatorMatrixIpdgTri2D(parAlmond::parCOO& A){ - int rankM = mesh.rank; - int Np = mesh.Np; int Nfp = mesh.Nfp; int Nfaces = mesh.Nfaces; @@ -67,35 +69,31 @@ void elliptic_t::BuildOperatorMatrixIpdgTri2D(parAlmond::parCOO& A){ hlong Nnum = Np*Nelements; // create a global numbering system - hlong *globalIds = (hlong *) calloc((Nelements+mesh.totalHaloPairs)*Np,sizeof(hlong)); + memory globalIds((Nelements+mesh.totalHaloPairs)*Np); // every degree of freedom has its own global id - A.globalRowStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong)); - A.globalColStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong)); - MPI_Allgather(&Nnum, 1, MPI_HLONG, 
A.globalRowStarts+1, 1, MPI_HLONG, mesh.comm); + A.globalRowStarts.malloc(mesh.size+1,0); + A.globalColStarts.malloc(mesh.size+1,0); + mesh.comm.Allgather(Nnum, A.globalRowStarts+1); for(int r=0;rExchange(globalIds, Np, ogs_hlong); + mesh.halo.Exchange(globalIds, Np); dlong nnzLocalBound = Np*Np*(1+Nfaces)*Nelements; @@ -103,7 +101,7 @@ void elliptic_t::BuildOperatorMatrixIpdgTri2D(parAlmond::parCOO& A){ dfloat tol = 1e-8; // surface mass matrices MS = MM*LIFT - dfloat *MS = (dfloat *) calloc(Nfaces*Nfp*Nfp,sizeof(dfloat)); + memory MS(Nfaces*Nfp*Nfp); for (int f=0;f SM(Np*Np); + memory SP(Np*Np); - if(rankM==0) {printf("Building full IPDG matrix...");fflush(stdout);} + if(Comm::World().rank()==0) {printf("Building full IPDG matrix...");fflush(stdout);} // loop over all elements for(dlong eM=0;eM MSf = MS+fM*Nfp*Nfp; // penalty term just involves face nodes for(int n=0;ntol){ + if(std::abs(val)>tol){ A.entries[nnz].row = globalIds[eM*Np + n]; A.entries[nnz].col = globalIds[eP*Np + m]; A.entries[nnz].val = val; @@ -277,7 +275,7 @@ void elliptic_t::BuildOperatorMatrixIpdgTri2D(parAlmond::parCOO& A){ for(int n=0;ntol){ + if(std::abs(val)>tol){ A.entries[nnz].row = globalIds[eM*Np + n]; A.entries[nnz].col = globalIds[eM*Np + m]; A.entries[nnz].val = val; @@ -289,19 +287,19 @@ void elliptic_t::BuildOperatorMatrixIpdgTri2D(parAlmond::parCOO& A){ //printf("nnz = %d\n", nnz); - std::sort(A.entries, A.entries+nnz, - [](const parAlmond::parCOO::nonZero_t& a, - const parAlmond::parCOO::nonZero_t& b) { - if (a.row < b.row) return true; - if (a.row > b.row) return false; + sort(A.entries.ptr(), A.entries.ptr()+nnz, + [](const parAlmond::parCOO::nonZero_t& a, + const parAlmond::parCOO::nonZero_t& b) { + if (a.row < b.row) return true; + if (a.row > b.row) return false; - return a.col < b.col; - }); + return a.col < b.col; + }); //*A = (parAlmond::parCOO::nonZero_t*) realloc(*A, nnz*sizeof(parAlmond::parCOO::nonZero_t)); A.nnz = nnz; - if(rankM==0) printf("done.\n"); + 
if(Comm::World().rank()==0) printf("done.\n"); #if 0 dfloat* Ap = (dfloat *) calloc(Np*Np*Nelements*Nelements,sizeof(dfloat)); @@ -319,17 +317,10 @@ void elliptic_t::BuildOperatorMatrixIpdgTri2D(parAlmond::parCOO& A){ printf("\n"); } #endif - - free(globalIds); - - free(SM); free(SP); - free(MS); } void elliptic_t::BuildOperatorMatrixIpdgTri3D(parAlmond::parCOO& A){ - int rankM = mesh.rank; - int Np = mesh.Np; int Nfp = mesh.Nfp; int Nfaces = mesh.Nfaces; @@ -339,35 +330,31 @@ void elliptic_t::BuildOperatorMatrixIpdgTri3D(parAlmond::parCOO& A){ hlong Nnum = Np*Nelements; // create a global numbering system - hlong *globalIds = (hlong *) calloc((Nelements+mesh.totalHaloPairs)*Np,sizeof(hlong)); + memory globalIds((Nelements+mesh.totalHaloPairs)*Np); // every degree of freedom has its own global id - A.globalRowStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong)); - A.globalColStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong)); - MPI_Allgather(&Nnum, 1, MPI_HLONG, A.globalRowStarts+1, 1, MPI_HLONG, mesh.comm); + A.globalRowStarts.malloc(mesh.size+1,0); + A.globalColStarts.malloc(mesh.size+1,0); + mesh.comm.Allgather(Nnum, A.globalRowStarts+1); for(int r=0;rExchange(globalIds, Np, ogs_hlong); + mesh.halo.Exchange(globalIds, Np); dlong nnzLocalBound = Np*Np*(1+Nfaces)*Nelements; @@ -375,7 +362,7 @@ void elliptic_t::BuildOperatorMatrixIpdgTri3D(parAlmond::parCOO& A){ dfloat tol = 1e-8; // surface mass matrices MS = MM*LIFT - dfloat *MS = (dfloat *) calloc(Nfaces*Nfp*Nfp,sizeof(dfloat)); + memory MS(Nfaces*Nfp*Nfp); for (int f=0;f SM(Np*Np); + memory SP(Np*Np); - if(rankM==0) {printf("Building full IPDG matrix...");fflush(stdout);} + if(Comm::World().rank()==0) {printf("Building full IPDG matrix...");fflush(stdout);} // loop over all elements for(dlong eM=0;eM MSf = MS+fM*Nfp*Nfp; // penalty term just involves face nodes for(int n=0;ntol){ + if(std::abs(val)>tol){ A.entries[nnz].row = globalIds[eM*Np + n]; A.entries[nnz].col = globalIds[eP*Np + m]; A.entries[nnz].val = 
val; @@ -566,7 +553,7 @@ void elliptic_t::BuildOperatorMatrixIpdgTri3D(parAlmond::parCOO& A){ for(int n=0;ntol){ + if(std::abs(val)>tol){ A.entries[nnz].row = globalIds[eM*Np + n]; A.entries[nnz].col = globalIds[eM*Np + m]; A.entries[nnz].val = val; @@ -578,32 +565,25 @@ void elliptic_t::BuildOperatorMatrixIpdgTri3D(parAlmond::parCOO& A){ //printf("nnz = %d\n", nnz); - std::sort(A.entries, A.entries+nnz, - [](const parAlmond::parCOO::nonZero_t& a, - const parAlmond::parCOO::nonZero_t& b) { - if (a.row < b.row) return true; - if (a.row > b.row) return false; + sort(A.entries.ptr(), A.entries.ptr()+nnz, + [](const parAlmond::parCOO::nonZero_t& a, + const parAlmond::parCOO::nonZero_t& b) { + if (a.row < b.row) return true; + if (a.row > b.row) return false; - return a.col < b.col; - }); + return a.col < b.col; + }); //*A = (parAlmond::parCOO::nonZero_t*) realloc(*A, nnz*sizeof(parAlmond::parCOO::nonZero_t)); A.nnz = nnz; - if(rankM==0) printf("done.\n"); - - free(globalIds); - - free(SM); free(SP); - free(MS); + if(Comm::World().rank()==0) printf("done.\n"); } void elliptic_t::BuildOperatorMatrixIpdgQuad2D(parAlmond::parCOO& A){ - int rankM = mesh.rank; - int Np = mesh.Np; int Nfaces = mesh.Nfaces; dlong Nelements = mesh.Nelements; @@ -611,35 +591,31 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad2D(parAlmond::parCOO& A){ hlong Nnum = mesh.Np*mesh.Nelements; // create a global numbering system - hlong *globalIds = (hlong *) calloc((Nelements+mesh.totalHaloPairs)*Np,sizeof(hlong)); + memory globalIds((Nelements+mesh.totalHaloPairs)*Np); // every degree of freedom has its own global id - A.globalRowStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong)); - A.globalColStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong)); - MPI_Allgather(&Nnum, 1, MPI_HLONG, A.globalRowStarts+1, 1, MPI_HLONG, mesh.comm); + A.globalRowStarts.malloc(mesh.size+1,0); + A.globalColStarts.malloc(mesh.size+1,0); + mesh.comm.Allgather(Nnum, A.globalRowStarts+1); for(int r=0;rExchange(globalIds, Np, 
ogs_hlong); + mesh.halo.Exchange(globalIds, Np); dlong nnzLocalBound = Np*Np*(1+Nfaces)*Nelements; @@ -647,9 +623,9 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad2D(parAlmond::parCOO& A){ dfloat tol = 1e-8; // build some monolithic basis arrays (use Dr,Ds,Dt and insert MM instead of weights for tet version) - dfloat *B = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat)); - dfloat *Br = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat)); - dfloat *Bs = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat)); + memory B (mesh.Np*mesh.Np, 0.0); + memory Br(mesh.Np*mesh.Np, 0.0); + memory Bs(mesh.Np*mesh.Np, 0.0); int mode = 0; for(int nj=0;njtol){ + if(std::abs(AnmP)>tol){ // remote info dlong eP = mesh.EToE[eM*mesh.Nfaces+fM]; A.entries[nnz].row = globalIds[eM*mesh.Np + n]; @@ -800,7 +776,7 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad2D(parAlmond::parCOO& A){ ++nnz; } } - if(fabs(Anm)>tol){ + if(std::abs(Anm)>tol){ // local block A.entries[nnz].row = globalIds[eM*mesh.Np+n]; A.entries[nnz].col = globalIds[eM*mesh.Np+m]; @@ -812,29 +788,24 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad2D(parAlmond::parCOO& A){ } // sort received non-zero entries by row block - std::sort(A.entries, A.entries+nnz, - [](const parAlmond::parCOO::nonZero_t& a, - const parAlmond::parCOO::nonZero_t& b) { - if (a.row < b.row) return true; - if (a.row > b.row) return false; + sort(A.entries.ptr(), A.entries.ptr()+nnz, + [](const parAlmond::parCOO::nonZero_t& a, + const parAlmond::parCOO::nonZero_t& b) { + if (a.row < b.row) return true; + if (a.row > b.row) return false; - return a.col < b.col; - }); + return a.col < b.col; + }); //*A = (parAlmond::parCOO::nonZero_t*) realloc(*A, nnz*sizeof(parAlmond::parCOO::nonZero_t)); A.nnz = nnz; - if(rankM==0) printf("done.\n"); - - free(globalIds); - free(B); free(Br); free(Bs); + if(Comm::World().rank()==0) printf("done.\n"); } void elliptic_t::BuildOperatorMatrixIpdgQuad3D(parAlmond::parCOO& A){ - int rankM = mesh.rank; - int Np = mesh.Np; int Nfaces = 
mesh.Nfaces; dlong Nelements = mesh.Nelements; @@ -842,34 +813,31 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad3D(parAlmond::parCOO& A){ hlong Nnum = mesh.Np*mesh.Nelements; // create a global numbering system - hlong *globalIds = (hlong *) calloc((Nelements+mesh.totalHaloPairs)*Np,sizeof(hlong)); + memory globalIds((Nelements+mesh.totalHaloPairs)*Np); // every degree of freedom has its own global id - A.globalRowStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong)); - A.globalColStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong)); - MPI_Allgather(&Nnum, 1, MPI_HLONG, A.globalRowStarts+1, 1, MPI_HLONG, mesh.comm); + A.globalRowStarts.malloc(mesh.size+1,0); + A.globalColStarts.malloc(mesh.size+1,0); + mesh.comm.Allgather(Nnum, A.globalRowStarts+1); for(int r=0;rExchange(globalIds, Np, ogs_hlong); + mesh.halo.Exchange(globalIds, Np); dlong nnzLocalBound = Np*Np*(1+Nfaces)*Nelements; @@ -877,9 +845,9 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad3D(parAlmond::parCOO& A){ dfloat tol = 1e-8; // build some monolithic basis arrays (use Dr,Ds,Dt and insert MM instead of weights for tet version) - dfloat *B = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat)); - dfloat *Br = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat)); - dfloat *Bs = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat)); + memory B (mesh.Np*mesh.Np, 0.0); + memory Br(mesh.Np*mesh.Np, 0.0); + memory Bs(mesh.Np*mesh.Np, 0.0); int mode = 0; for(int nj=0;njtol){ + if(std::abs(AnmP)>tol){ // remote info dlong eP = mesh.EToE[eM*mesh.Nfaces+fM]; A.entries[nnz].row = globalIds[eM*mesh.Np + n]; @@ -1039,7 +1007,7 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad3D(parAlmond::parCOO& A){ } } - if(fabs(Anm)>tol){ + if(std::abs(Anm)>tol){ // local block A.entries[nnz].row = globalIds[eM*mesh.Np+n]; A.entries[nnz].col = globalIds[eM*mesh.Np+m]; @@ -1051,19 +1019,19 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad3D(parAlmond::parCOO& A){ } // sort received non-zero entries by row block - std::sort(A.entries, 
A.entries+nnz, - [](const parAlmond::parCOO::nonZero_t& a, - const parAlmond::parCOO::nonZero_t& b) { - if (a.row < b.row) return true; - if (a.row > b.row) return false; + sort(A.entries.ptr(), A.entries.ptr()+nnz, + [](const parAlmond::parCOO::nonZero_t& a, + const parAlmond::parCOO::nonZero_t& b) { + if (a.row < b.row) return true; + if (a.row > b.row) return false; - return a.col < b.col; - }); + return a.col < b.col; + }); //*A = (parAlmond::parCOO::nonZero_t*) realloc(*A, nnz*sizeof(parAlmond::parCOO::nonZero_t)); A.nnz = nnz; - if(rankM==0) printf("done.\n"); + if(Comm::World().rank()==0) printf("done.\n"); #if 0 { @@ -1077,9 +1045,6 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad3D(parAlmond::parCOO& A){ fclose(fp); } #endif - - free(globalIds); - free(B); free(Br); free(Bs); } @@ -1089,41 +1054,35 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad3D(parAlmond::parCOO& A){ void elliptic_t::BuildOperatorMatrixIpdgTet3D(parAlmond::parCOO& A){ - int rankM = mesh.rank; - // number of degrees of freedom on this rank hlong Nnum = mesh.Np*mesh.Nelements; // create a global numbering system - hlong *globalIds = (hlong *) calloc((mesh.Nelements+mesh.totalHaloPairs)*mesh.Np,sizeof(hlong)); + memory globalIds((mesh.Nelements+mesh.totalHaloPairs)*mesh.Np); // every degree of freedom has its own global id - A.globalRowStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong)); - A.globalColStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong)); - MPI_Allgather(&Nnum, 1, MPI_HLONG, A.globalRowStarts+1, 1, MPI_HLONG, mesh.comm); + A.globalRowStarts.malloc(mesh.size+1,0); + A.globalColStarts.malloc(mesh.size+1,0); + mesh.comm.Allgather(Nnum, A.globalRowStarts+1); for(int r=0;rExchange(globalIds, mesh.Np, ogs_hlong); + mesh.halo.Exchange(globalIds, mesh.Np); dlong nnzLocalBound = mesh.Np*mesh.Np*(1+mesh.Nfaces)*mesh.Nelements; @@ -1131,7 +1090,7 @@ void elliptic_t::BuildOperatorMatrixIpdgTet3D(parAlmond::parCOO& A){ dfloat tol = 1e-8; // surface mass matrices MS = MM*LIFT - dfloat *MS = 
(dfloat *) calloc(mesh.Nfaces*mesh.Np*mesh.Nfp,sizeof(dfloat)); + memory MS(mesh.Nfaces*mesh.Np*mesh.Nfp); for (int f=0;f DrTMS(mesh.Nfaces*mesh.Np*mesh.Nfp); + memory DsTMS(mesh.Nfaces*mesh.Np*mesh.Nfp); + memory DtTMS(mesh.Nfaces*mesh.Np*mesh.Nfp); for (int f=0;f BM(mesh.Np*mesh.Np); - dfloat *qmP = (dfloat *) calloc(mesh.Nfp,sizeof(dfloat)); - dfloat *qmM = (dfloat *) calloc(mesh.Nfp,sizeof(dfloat)); - dfloat *ndotgradqmM = (dfloat *) calloc(mesh.Nfp,sizeof(dfloat)); - dfloat *ndotgradqmP = (dfloat *) calloc(mesh.Nfp,sizeof(dfloat)); + memory qmP(mesh.Nfp); + memory qmM(mesh.Nfp); + memory ndotgradqmM(mesh.Nfp); + memory ndotgradqmP(mesh.Nfp); //#pragma omp for for(dlong eM=0;eMtol){ + if(std::abs(AnmP)>tol){ //#pragma omp critical { // remote info @@ -1325,7 +1284,7 @@ void elliptic_t::BuildOperatorMatrixIpdgTet3D(parAlmond::parCOO& A){ for (int m=0;mtol){ + if(std::abs(Anm)>tol){ //#pragma omp critical { A.entries[nnz].row = globalIds[eM*mesh.Np+n]; @@ -1337,36 +1296,25 @@ void elliptic_t::BuildOperatorMatrixIpdgTet3D(parAlmond::parCOO& A){ } } } - - free(BM); - free(qmM); free(qmP); - free(ndotgradqmM); free(ndotgradqmP); } - std::sort(A.entries, A.entries+nnz, - [](const parAlmond::parCOO::nonZero_t& a, - const parAlmond::parCOO::nonZero_t& b) { - if (a.row < b.row) return true; - if (a.row > b.row) return false; + sort(A.entries.ptr(), A.entries.ptr()+nnz, + [](const parAlmond::parCOO::nonZero_t& a, + const parAlmond::parCOO::nonZero_t& b) { + if (a.row < b.row) return true; + if (a.row > b.row) return false; - return a.col < b.col; - }); + return a.col < b.col; + }); // free up unused storage //*A = (parAlmond::parCOO::nonZero_t*) realloc(*A, nnz*sizeof(parAlmond::parCOO::nonZero_t)); A.nnz = nnz; - if(rankM==0) printf("done.\n"); - - free(globalIds); - - free(MS); - free(DrTMS); free(DsTMS); free(DtTMS); + if(Comm::World().rank()==0) printf("done.\n"); } void elliptic_t::BuildOperatorMatrixIpdgHex3D(parAlmond::parCOO& A){ - int rankM = mesh.rank; - int Np 
= mesh.Np; int Nfaces = mesh.Nfaces; dlong Nelements = mesh.Nelements; @@ -1374,35 +1322,31 @@ void elliptic_t::BuildOperatorMatrixIpdgHex3D(parAlmond::parCOO& A){ hlong Nnum = mesh.Np*mesh.Nelements; // create a global numbering system - hlong *globalIds = (hlong *) calloc((Nelements+mesh.totalHaloPairs)*Np,sizeof(hlong)); + memory globalIds((Nelements+mesh.totalHaloPairs)*Np); // every degree of freedom has its own global id - A.globalRowStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong)); - A.globalColStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong)); - MPI_Allgather(&Nnum, 1, MPI_HLONG, A.globalRowStarts+1, 1, MPI_HLONG, mesh.comm); + A.globalRowStarts.malloc(mesh.size+1,0); + A.globalColStarts.malloc(mesh.size+1,0); + mesh.comm.Allgather(Nnum, A.globalRowStarts+1); for(int r=0;rExchange(globalIds, Np, ogs_hlong); + mesh.halo.Exchange(globalIds, Np); dlong nnzLocalBound = Np*Np*(1+Nfaces)*Nelements; @@ -1410,10 +1354,10 @@ void elliptic_t::BuildOperatorMatrixIpdgHex3D(parAlmond::parCOO& A){ dfloat tol = 1e-8; // build some monolithic basis arrays (use Dr,Ds,Dt and insert MM instead of weights for tet version) - dfloat *B = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat)); - dfloat *Br = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat)); - dfloat *Bs = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat)); - dfloat *Bt = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat)); + memory B (mesh.Np*mesh.Np, 0.0); + memory Br(mesh.Np*mesh.Np, 0.0); + memory Bs(mesh.Np*mesh.Np, 0.0); + memory Bt(mesh.Np*mesh.Np, 0.0); int mode = 0; for(int nk=0;nktol){ + if(std::abs(AnmP)>tol){ //#pragma omp critical { // remote info @@ -1595,7 +1539,7 @@ void elliptic_t::BuildOperatorMatrixIpdgHex3D(parAlmond::parCOO& A){ } } } - if(fabs(Anm)>tol){ + if(std::abs(Anm)>tol){ //#pragma omp critical { // local block @@ -1610,20 +1554,17 @@ void elliptic_t::BuildOperatorMatrixIpdgHex3D(parAlmond::parCOO& A){ } // sort received non-zero entries by row block - std::sort(A.entries, A.entries+nnz, 
- [](const parAlmond::parCOO::nonZero_t& a, - const parAlmond::parCOO::nonZero_t& b) { - if (a.row < b.row) return true; - if (a.row > b.row) return false; + sort(A.entries.ptr(), A.entries.ptr()+nnz, + [](const parAlmond::parCOO::nonZero_t& a, + const parAlmond::parCOO::nonZero_t& b) { + if (a.row < b.row) return true; + if (a.row > b.row) return false; - return a.col < b.col; - }); + return a.col < b.col; + }); //*A = (parAlmond::parCOO::nonZero_t*) realloc(*A, nnz*sizeof(parAlmond::parCOO::nonZero_t)); A.nnz = nnz; - if(rankM==0) printf("done.\n"); - - free(globalIds); - free(B); free(Br); free(Bs); free(Bt); + if(Comm::World().rank()==0) printf("done.\n"); } diff --git a/solvers/elliptic/src/ellipticOperator.cpp b/solvers/elliptic/src/ellipticOperator.cpp index fb5567d1d..6fa4cbb5d 100644 --- a/solvers/elliptic/src/ellipticOperator.cpp +++ b/solvers/elliptic/src/ellipticOperator.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) - Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,24 +26,25 @@ #include "elliptic.hpp" -void elliptic_t::Operator(occa::memory &o_q, occa::memory &o_Aq){ +void elliptic_t::Operator(deviceMemory &o_q, deviceMemory &o_Aq){ if(disc_c0){ - // int mapType = (mesh.elementType==HEXAHEDRA && + // int mapType = (mesh.elementType==Mesh::HEXAHEDRA && // mesh.settings.compareSetting("ELEMENT MAP", "TRILINEAR")) ? 1:0; - // int integrationType = (mesh.elementType==HEXAHEDRA && + // int integrationType = (mesh.elementType==Mesh::HEXAHEDRA && // settings.compareSetting("ELLIPTIC INTEGRATION", "CUBATURE")) ? 
1:0; - ogsMasked->GatheredHaloExchangeStart(o_q, 1, ogs_dfloat); + gHalo.ExchangeStart(o_q, 1); - if(mesh.NlocalGatherElements){ + if(mesh.NlocalGatherElements/2){ // if(integrationType==0) { // GLL or non-hex // if(mapType==0) - partialAxKernel(mesh.NlocalGatherElements, + partialAxKernel(mesh.NlocalGatherElements/2, mesh.o_localGatherElementList, - ogsMasked->o_GlobalToLocal, - mesh.o_ggeo, mesh.o_D, mesh.o_S, + o_GlobalToLocal, + mesh.o_wJ, mesh.o_ggeo, + mesh.o_D, mesh.o_S, mesh.o_MM, lambda, o_q, o_AqL); /* NC: disabling until we re-add treatment of affine elements else @@ -63,7 +64,7 @@ void elliptic_t::Operator(occa::memory &o_q, occa::memory &o_Aq){ } // finalize halo exchange - ogsMasked->GatheredHaloExchangeFinish(o_q, 1, ogs_dfloat); + gHalo.ExchangeFinish(o_q, 1); if(mesh.NglobalGatherElements) { @@ -71,8 +72,9 @@ void elliptic_t::Operator(occa::memory &o_q, occa::memory &o_Aq){ // if(mapType==0) partialAxKernel(mesh.NglobalGatherElements, mesh.o_globalGatherElementList, - ogsMasked->o_GlobalToLocal, - mesh.o_ggeo, mesh.o_D, mesh.o_S, + o_GlobalToLocal, + mesh.o_wJ, mesh.o_ggeo, + mesh.o_D, mesh.o_S, mesh.o_MM, lambda, o_q, o_AqL); /* NC: disabling until we re-add treatment of affine elements else @@ -90,7 +92,18 @@ void elliptic_t::Operator(occa::memory &o_q, occa::memory &o_Aq){ } //gather result to Aq - ogsMasked->Gather(o_Aq, o_AqL, ogs_dfloat, ogs_add, ogs_trans); + ogsMasked.GatherStart(o_Aq, o_AqL, 1, ogs::Add, ogs::Trans); + + if((mesh.NlocalGatherElements+1)/2){ + partialAxKernel((mesh.NlocalGatherElements+1)/2, + mesh.o_localGatherElementList+(mesh.NlocalGatherElements/2), + o_GlobalToLocal, + mesh.o_wJ, mesh.o_ggeo, + mesh.o_D, mesh.o_S, + mesh.o_MM, lambda, o_q, o_AqL); + } + + ogsMasked.GatherFinish(o_Aq, o_AqL, 1, ogs::Add, ogs::Trans); } else if(disc_ipdg) { @@ -105,7 +118,7 @@ void elliptic_t::Operator(occa::memory &o_q, occa::memory &o_Aq){ } // dfloat4 storage -> 4 entries - traceHalo->ExchangeStart(o_grad, 4, ogs_dfloat); + 
traceHalo.ExchangeStart(o_grad, 4); if(mesh.NinternalElements) partialIpdgKernel(mesh.NinternalElements, @@ -123,7 +136,7 @@ void elliptic_t::Operator(occa::memory &o_q, occa::memory &o_Aq){ o_grad, o_Aq); - traceHalo->ExchangeFinish(o_grad, 4, ogs_dfloat); + traceHalo.ExchangeFinish(o_grad, 4); if(mesh.NhaloElements) { partialIpdgKernel(mesh.NhaloElements, diff --git a/solvers/elliptic/src/ellipticPlotFields.cpp b/solvers/elliptic/src/ellipticPlotFields.cpp index ea3437167..8df1af8ba 100644 --- a/solvers/elliptic/src/ellipticPlotFields.cpp +++ b/solvers/elliptic/src/ellipticPlotFields.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,11 +27,11 @@ SOFTWARE. #include "elliptic.hpp" // interpolate data to plot nodes and save to file (one per process -void elliptic_t::PlotFields(dfloat* Q, char *fileName){ +void elliptic_t::PlotFields(memory& Q, std::string fileName){ FILE *fp; - fp = fopen(fileName, "w"); + fp = fopen(fileName.c_str(), "w"); fprintf(fp, "\n"); fprintf(fp, " \n"); @@ -44,30 +44,36 @@ void elliptic_t::PlotFields(dfloat* Q, char *fileName){ fprintf(fp, " \n"); //scratch space for interpolation - size_t NscratchBytes = mymax(mesh.Np, mesh.plotNp)*sizeof(dfloat); - dfloat* scratch = (dfloat *) malloc(2*NscratchBytes); + size_t Nscratch = std::max(mesh.Np, mesh.plotNp); + memory scratch(2*Nscratch); - dfloat* Ix = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iy = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iz = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); + memory Ix(mesh.plotNp); + memory Iy(mesh.plotNp); + memory Iz(mesh.plotNp); // compute plot node coordinates on the fly for(dlong e=0;e\n"); fprintf(fp, " \n"); - free(Ix); 
free(Iy); free(Iz); - - dfloat* Iq = (dfloat *) malloc(mesh.plotNp*Nfields*sizeof(dfloat)); + memory Iq(mesh.plotNp*Nfields); // write out fields fprintf(fp, " \n"); @@ -86,8 +92,6 @@ void elliptic_t::PlotFields(dfloat* Q, char *fileName){ fprintf(fp, " \n"); fprintf(fp, " \n"); - free(Iq); - fprintf(fp, " \n"); fprintf(fp, " \n"); @@ -128,6 +132,4 @@ void elliptic_t::PlotFields(dfloat* Q, char *fileName){ fprintf(fp, " \n"); fprintf(fp, "\n"); fclose(fp); - - free(scratch); } diff --git a/solvers/elliptic/src/ellipticPreconJacobi.cpp b/solvers/elliptic/src/ellipticPreconJacobi.cpp index 0dc1694ec..805222ed1 100644 --- a/solvers/elliptic/src/ellipticPreconJacobi.cpp +++ b/solvers/elliptic/src/ellipticPreconJacobi.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -30,23 +30,22 @@ SOFTWARE. 
JacobiPrecon::JacobiPrecon(elliptic_t& _elliptic): elliptic(_elliptic) { - dfloat *diagA = (dfloat*) calloc(elliptic.Ndofs, sizeof(dfloat)); - dfloat *invDiagA = (dfloat*) calloc(elliptic.Ndofs, sizeof(dfloat)); + memory diagA (elliptic.Ndofs); + memory invDiagA(elliptic.Ndofs); elliptic.BuildOperatorDiagonal(diagA); for (dlong n=0;n(invDiagA); } -void JacobiPrecon::Operator(occa::memory& o_r, occa::memory& o_Mr) { +void JacobiPrecon::Operator(deviceMemory& o_r, deviceMemory& o_Mr) { + + linAlg_t& linAlg = elliptic.platform.linAlg(); // Mr = invDiag.*r - elliptic.linAlg.amxpy(elliptic.Ndofs, 1.0, o_invDiagA, o_r, 0.0, o_Mr); + linAlg.amxpy(elliptic.Ndofs, 1.0, o_invDiagA, o_r, 0.0, o_Mr); // zero mean of RHS if(elliptic.allNeumann) elliptic.ZeroMean(o_Mr); -} \ No newline at end of file +} diff --git a/solvers/elliptic/src/ellipticPreconMassMatrix.cpp b/solvers/elliptic/src/ellipticPreconMassMatrix.cpp index d425e8219..b05c2237e 100644 --- a/solvers/elliptic/src/ellipticPreconMassMatrix.cpp +++ b/solvers/elliptic/src/ellipticPreconMassMatrix.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -31,67 +31,80 @@ MassMatrixPrecon::MassMatrixPrecon(elliptic_t& _elliptic): elliptic(_elliptic), mesh(_elliptic.mesh), settings(_elliptic.settings) { //sanity checking - if (mesh.elementType!=TRIANGLES && mesh.elementType!=TETRAHEDRA ) - LIBP_ABORT(string("MASSMATRIX preconditioner is only available for triangle and tetrhedra elements. Use JACOBI instead.")); + LIBP_ABORT("MASSMATRIX preconditioner is only available for triangle and tetrhedra elements. 
Use JACOBI instead.", + mesh.elementType!=Mesh::TRIANGLES && mesh.elementType!=Mesh::TETRAHEDRA); - if (elliptic.lambda==0) - LIBP_ABORT(string("MASSMATRIX preconditioner is unavailble when lambda=0.")); + LIBP_ABORT("MASSMATRIX preconditioner is unavailble when lambda=0.", + elliptic.lambda==0); - o_invMM = elliptic.platform.malloc(mesh.Np*mesh.Np*sizeof(dfloat), mesh.invMM); + o_invMM = elliptic.platform.malloc(mesh.invMM); // OCCA build stuff - occa::properties kernelInfo = elliptic.mesh.props; //copy base occa properties + properties_t kernelInfo = mesh.props; //copy base occa properties int blockMax = 256; if (elliptic.platform.device.mode() == "CUDA") blockMax = 512; - int NblockV = mymax(1,blockMax/mesh.Np); + int NblockV = std::max(1,blockMax/mesh.Np); kernelInfo["defines/" "p_NblockV"]= NblockV; if (settings.compareSetting("DISCRETIZATION", "IPDG")) { blockJacobiKernel = elliptic.platform.buildKernel(DELLIPTIC "/okl/ellipticPreconBlockJacobi.okl", "blockJacobi", kernelInfo); } else if (settings.compareSetting("DISCRETIZATION", "CONTINUOUS")) { - dlong Ntotal = elliptic.ogsMasked->Ngather + elliptic.ogsMasked->NgatherHalo; - o_rtmp = elliptic.platform.malloc(Ntotal*sizeof(dfloat)); - o_MrL = elliptic.platform.malloc(mesh.Np*mesh.Nelements*sizeof(dfloat)); + dlong Ntotal = elliptic.ogsMasked.Ngather + elliptic.gHalo.Nhalo; + o_rtmp = elliptic.platform.malloc(Ntotal); + o_MrL = elliptic.platform.malloc(mesh.Np*mesh.Nelements); partialBlockJacobiKernel = elliptic.platform.buildKernel(DELLIPTIC "/okl/ellipticPreconBlockJacobi.okl", "partialBlockJacobi", kernelInfo); } } -void MassMatrixPrecon::Operator(occa::memory& o_r, occa::memory& o_Mr) { +void MassMatrixPrecon::Operator(deviceMemory& o_r, deviceMemory& o_Mr) { dfloat invLambda = 1./elliptic.lambda; + linAlg_t& linAlg = elliptic.platform.linAlg(); + if (elliptic.disc_c0) {//C0 // rtmp = invDegree.*r - elliptic.linAlg.amxpy(elliptic.Ndofs, 1.0, elliptic.o_weightG, o_r, 0.0, o_rtmp); + 
linAlg.amxpy(elliptic.Ndofs, 1.0, elliptic.o_weightG, o_r, 0.0, o_rtmp); - elliptic.ogsMasked->GatheredHaloExchangeStart(o_rtmp, 1, ogs_dfloat); + elliptic.gHalo.ExchangeStart(o_rtmp, 1); - if(mesh.NlocalGatherElements) - partialBlockJacobiKernel(mesh.NlocalGatherElements, + if(mesh.NlocalGatherElements/2) + partialBlockJacobiKernel(mesh.NlocalGatherElements/2, mesh.o_localGatherElementList, - elliptic.ogsMasked->o_GlobalToLocal, + elliptic.o_GlobalToLocal, invLambda, mesh.o_vgeo, o_invMM, o_rtmp, o_MrL); - elliptic.ogsMasked->GatheredHaloExchangeFinish(o_rtmp, 1, ogs_dfloat); + // finalize halo exchange + elliptic.gHalo.ExchangeFinish(o_rtmp, 1); if(mesh.NglobalGatherElements) partialBlockJacobiKernel(mesh.NglobalGatherElements, mesh.o_globalGatherElementList, - elliptic.ogsMasked->o_GlobalToLocal, + elliptic.o_GlobalToLocal, invLambda, mesh.o_vgeo, o_invMM, o_rtmp, o_MrL); //gather result to Aq - elliptic.ogsMasked->Gather(o_Mr, o_MrL, ogs_dfloat, ogs_add, ogs_trans); + elliptic.ogsMasked.GatherStart(o_Mr, o_MrL, 1, ogs::Add, ogs::Trans); + + if((mesh.NlocalGatherElements+1)/2){ + partialBlockJacobiKernel((mesh.NlocalGatherElements+1)/2, + mesh.o_localGatherElementList+mesh.NlocalGatherElements/2, + elliptic.o_GlobalToLocal, + invLambda, mesh.o_vgeo, o_invMM, + o_rtmp, o_MrL); + } + + elliptic.ogsMasked.GatherFinish(o_Mr, o_MrL, 1, ogs::Add, ogs::Trans); // Mr = invDegree.*Mr - elliptic.linAlg.amx(elliptic.Ndofs, 1.0, elliptic.o_weightG, o_Mr); + linAlg.amx(elliptic.Ndofs, 1.0, elliptic.o_weightG, o_Mr); } else { //IPDG @@ -101,8 +114,3 @@ void MassMatrixPrecon::Operator(occa::memory& o_r, occa::memory& o_Mr) { // zero mean of RHS if(elliptic.allNeumann) elliptic.ZeroMean(o_Mr); } - -MassMatrixPrecon::~MassMatrixPrecon(){ - blockJacobiKernel.free(); - partialBlockJacobiKernel.free(); -} \ No newline at end of file diff --git a/solvers/elliptic/src/ellipticPreconMultiGrid.cpp b/solvers/elliptic/src/ellipticPreconMultiGrid.cpp index a6b027df9..169e5c625 100644 --- 
a/solvers/elliptic/src/ellipticPreconMultiGrid.cpp +++ b/solvers/elliptic/src/ellipticPreconMultiGrid.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -28,7 +28,7 @@ SOFTWARE. // Matrix-free p-Multigrid levels followed by AMG -void MultiGridPrecon::Operator(occa::memory& o_r, occa::memory& o_Mr) { +void MultiGridPrecon::Operator(deviceMemory& o_r, deviceMemory& o_Mr) { //just pass to parAlmond parAlmond.Operator(o_r, o_Mr); @@ -46,37 +46,38 @@ MultiGridPrecon::MultiGridPrecon(elliptic_t& _elliptic): int NpFine = mesh.Np; int NpCoarse = mesh.Np; - MGLevel* prevLevel=nullptr; - MGLevel* currLevel=nullptr; - while(Nc>1) { + if (Comm::World().rank()==0){ + printf("-----------------------------Multigrid pMG Degree %2d----------------------------------------\n", Nc); + } //build mesh and elliptic objects for this degree - mesh_t &meshF = mesh.SetupNewDegree(Nf); - elliptic_t &ellipticF = elliptic.SetupNewDegree(meshF); + mesh_t meshF = mesh.SetupNewDegree(Nf); + elliptic_t ellipticF = elliptic.SetupNewDegree(meshF); //share masking data with previous MG level - if (prevLevel) { - prevLevel->meshC = &meshF; - prevLevel->ogsMaskedC = ellipticF.ogsMasked; + if (parAlmond.NumLevels()>0) { + MGLevel& prevLevel = parAlmond.GetLevel(parAlmond.NumLevels()-1); + prevLevel.meshC = meshF; + prevLevel.ellipticC = ellipticF; } //find the degree of the next level if (settings.compareSetting("MULTIGRID COARSENING","ALLDEGREES")) { Nc = Nf-1; } else if (settings.compareSetting("MULTIGRID COARSENING","HALFDEGREES")) { - Nc = mymax(1,(Nf+1)/2); + Nc = std::max(1,(Nf+1)/2); } else { //default "HALFDOFS" // pick the degrees so the dofs of each level halfs (roughly) while (NpCoarse > NpFine/2 
&& Nc>1) { Nc--; switch(mesh.elementType){ - case TRIANGLES: + case Mesh::TRIANGLES: NpCoarse = ((Nc+1)*(Nc+2))/2; break; - case QUADRILATERALS: + case Mesh::QUADRILATERALS: NpCoarse = (Nc+1)*(Nc+1); break; - case TETRAHEDRA: + case Mesh::TETRAHEDRA: NpCoarse = ((Nc+1)*(Nc+2)*(Nc+3))/6; break; - case HEXAHEDRA: + case Mesh::HEXAHEDRA: NpCoarse = (Nc+1)*(Nc+1)*(Nc+1); break; } } @@ -84,45 +85,50 @@ MultiGridPrecon::MultiGridPrecon(elliptic_t& _elliptic): //set Npcoarse switch(mesh.elementType){ - case TRIANGLES: + case Mesh::TRIANGLES: NpCoarse = ((Nc+1)*(Nc+2))/2; break; - case QUADRILATERALS: + case Mesh::QUADRILATERALS: NpCoarse = (Nc+1)*(Nc+1); break; - case TETRAHEDRA: + case Mesh::TETRAHEDRA: NpCoarse = ((Nc+1)*(Nc+2)*(Nc+3))/6; break; - case HEXAHEDRA: + case Mesh::HEXAHEDRA: NpCoarse = (Nc+1)*(Nc+1)*(Nc+1); break; } dlong Nrows, Ncols; if (settings.compareSetting("DISCRETIZATION", "CONTINUOUS")) { - Nrows = ellipticF.ogsMasked->Ngather; - Ncols = Nrows + ellipticF.ogsMasked->NgatherHalo; + Nrows = ellipticF.ogsMasked.Ngather; + Ncols = Nrows + ellipticF.gHalo.Nhalo; } else { Nrows = meshF.Nelements*meshF.Np; Ncols = Nrows + meshF.totalHaloPairs*mesh.Np; } - //make a multigrid level - currLevel = new MGLevel(ellipticF, Nrows, Ncols, Nc, NpCoarse); - parAlmond.AddLevel(currLevel); + //Add a multigrid level + parAlmond.AddLevel(ellipticF, Nrows, Ncols, Nc, NpCoarse); Nf = Nc; NpFine = NpCoarse; - prevLevel = currLevel; } //build matrix at degree 1 - mesh_t &meshF = mesh.SetupNewDegree(1); - elliptic_t &ellipticF = elliptic.SetupNewDegree(meshF); + if (Comm::World().rank()==0){ + printf("-----------------------------Multigrid pMG Degree 1----------------------------------------\n"); + } + mesh_t meshF = mesh.SetupNewDegree(1); + elliptic_t ellipticF = elliptic.SetupNewDegree(meshF); //share masking data with previous MG level - if (prevLevel) { - prevLevel->meshC = &meshF; - prevLevel->ogsMaskedC = ellipticF.ogsMasked; + if (parAlmond.NumLevels()>0) { + MGLevel& 
prevLevel = parAlmond.GetLevel(parAlmond.NumLevels()-1); + prevLevel.meshC = meshF; + prevLevel.ellipticC = ellipticF; } //build full A matrix and pass to parAlmond + if (Comm::World().rank()==0){ + printf("-----------------------------Multigrid AMG Setup--------------------------------------------\n"); + } parAlmond::parCOO A(elliptic.platform, mesh.comm); if (settings.compareSetting("DISCRETIZATION", "IPDG")) ellipticF.BuildOperatorMatrixIpdg(A); @@ -133,13 +139,15 @@ MultiGridPrecon::MultiGridPrecon(elliptic_t& _elliptic): int rank = mesh.rank; int size = mesh.size; hlong TotalRows = A.globalRowStarts[size]; - dlong numLocalRows = (dlong) (A.globalRowStarts[rank+1]-A.globalRowStarts[rank]); - dfloat *null = (dfloat *) malloc(numLocalRows*sizeof(dfloat)); - for (dlong i=0;i(A.globalRowStarts[rank+1]-A.globalRowStarts[rank]); + + memory null(numLocalRows); + for (dlong i=0;i& o_X, deviceMemory& o_Ax) { elliptic.Operator(o_X,o_Ax); } -void MGLevel::residual(occa::memory &o_RHS, occa::memory &o_X, occa::memory &o_RES) { +void MGLevel::residual(deviceMemory& o_RHS, deviceMemory& o_X, deviceMemory& o_RES) { elliptic.Operator(o_X,o_RES); // subtract res = rhs - A*x - linAlg.axpy(elliptic.Ndofs, 1.f, o_RHS, -1.f, o_RES); + platform.linAlg().axpy(elliptic.Ndofs, 1.f, o_RHS, -1.f, o_RES); } -void MGLevel::coarsen(occa::memory &o_X, occa::memory &o_Rx) { +void MGLevel::coarsen(deviceMemory& o_X, deviceMemory& o_Rx) { + + linAlg_t& linAlg = platform.linAlg(); if (elliptic.disc_c0) { //scratch spaces - occa::memory &o_wx = o_smootherResidual; - occa::memory &o_RxL = o_transferScratch; + deviceMemory& o_wx = o_smootherResidual; + deviceMemory& o_RxL = o_transferScratch; //pre-weight linAlg.amxpy(elliptic.Ndofs, 1.0, elliptic.o_weightG, o_X, 0.0, o_wx); - elliptic.ogsMasked->GatheredHaloExchangeStart(o_wx, 1, ogs_dfloat); + elliptic.gHalo.ExchangeStart(o_wx, 1); - if(mesh.NlocalGatherElements) - partialCoarsenKernel(mesh.NlocalGatherElements, + if(mesh.NlocalGatherElements/2) 
+ partialCoarsenKernel(mesh.NlocalGatherElements/2, mesh.o_localGatherElementList, - elliptic.ogsMasked->o_GlobalToLocal, + elliptic.o_GlobalToLocal, o_P, o_wx, o_RxL); - elliptic.ogsMasked->GatheredHaloExchangeFinish(o_wx, 1, ogs_dfloat); + elliptic.gHalo.ExchangeFinish(o_wx, 1); if(mesh.NglobalGatherElements) partialCoarsenKernel(mesh.NglobalGatherElements, mesh.o_globalGatherElementList, - elliptic.ogsMasked->o_GlobalToLocal, + elliptic.o_GlobalToLocal, + o_P, o_wx, o_RxL); + + ellipticC.ogsMasked.GatherStart(o_Rx, o_RxL, 1, ogs::Add, ogs::Trans); + + if((mesh.NlocalGatherElements+1)/2) + partialCoarsenKernel((mesh.NlocalGatherElements+1)/2, + mesh.o_localGatherElementList + mesh.NlocalGatherElements/2, + elliptic.o_GlobalToLocal, o_P, o_wx, o_RxL); - ogsMaskedC->Gather(o_Rx, o_RxL, ogs_dfloat, ogs_add, ogs_trans); + ellipticC.ogsMasked.GatherFinish(o_Rx, o_RxL, 1, ogs::Add, ogs::Trans); } else { coarsenKernel(mesh.Nelements, o_P, o_X, o_Rx); } } -void MGLevel::prolongate(occa::memory &o_X, occa::memory &o_Px) { +void MGLevel::prolongate(deviceMemory& o_X, deviceMemory& o_Px) { + + linAlg_t& linAlg = platform.linAlg(); + if (elliptic.disc_c0) { //scratch spaces - occa::memory &o_PxG = o_smootherResidual; - occa::memory &o_PxL = o_transferScratch; + deviceMemory& o_PxG = o_smootherResidual; + deviceMemory& o_PxL = o_transferScratch; - ogsMaskedC->GatheredHaloExchangeStart(o_X, 1, ogs_dfloat); + ellipticC.gHalo.ExchangeStart(o_X, 1); - if(mesh.NlocalGatherElements) - partialProlongateKernel(meshC->NlocalGatherElements, - meshC->o_localGatherElementList, - ogsMaskedC->o_GlobalToLocal, + if(meshC.NlocalGatherElements/2) + partialProlongateKernel(meshC.NlocalGatherElements/2, + meshC.o_localGatherElementList, + ellipticC.o_GlobalToLocal, o_P, o_X, o_PxL); - ogsMaskedC->GatheredHaloExchangeFinish(o_X, 1, ogs_dfloat); + ellipticC.gHalo.ExchangeFinish(o_X, 1); - if(mesh.NglobalGatherElements) - partialProlongateKernel(meshC->NglobalGatherElements, - 
meshC->o_globalGatherElementList, - ogsMaskedC->o_GlobalToLocal, + if(meshC.NglobalGatherElements) + partialProlongateKernel(meshC.NglobalGatherElements, + meshC.o_globalGatherElementList, + ellipticC.o_GlobalToLocal, o_P, o_X, o_PxL); //ogs_notrans -> no summation at repeated nodes, just one value - elliptic.ogsMasked->Gather(o_PxG, o_PxL, ogs_dfloat, ogs_add, ogs_notrans); + elliptic.ogsMasked.GatherStart(o_PxG, o_PxL, 1, ogs::Add, ogs::NoTrans); + + if((meshC.NlocalGatherElements+1)/2) + partialProlongateKernel((meshC.NlocalGatherElements+1)/2, + meshC.o_localGatherElementList + meshC.NlocalGatherElements/2, + ellipticC.o_GlobalToLocal, + o_P, o_X, o_PxL); + + elliptic.ogsMasked.GatherFinish(o_PxG, o_PxL, 1, ogs::Add, ogs::NoTrans); linAlg.axpy(elliptic.Ndofs, 1.f, o_PxG, 1.f, o_Px); @@ -103,7 +124,7 @@ void MGLevel::prolongate(occa::memory &o_X, occa::memory &o_Px) { } } -void MGLevel::smooth(occa::memory &o_RHS, occa::memory &o_X, bool x_is_zero) { +void MGLevel::smooth(deviceMemory& o_RHS, deviceMemory& o_X, bool x_is_zero) { if (stype==JACOBI) { smoothJacobi(o_RHS, o_X, x_is_zero); } else if (stype==CHEBYSHEV) { @@ -111,9 +132,11 @@ void MGLevel::smooth(occa::memory &o_RHS, occa::memory &o_X, bool x_is_zero) { } } -void MGLevel::smoothJacobi(occa::memory &o_r, occa::memory &o_X, bool xIsZero) { +void MGLevel::smoothJacobi(deviceMemory& o_r, deviceMemory& o_X, bool xIsZero) { - occa::memory &o_RES = o_smootherResidual; + linAlg_t& linAlg = platform.linAlg(); + + deviceMemory& o_RES = o_smootherResidual; if (xIsZero) { linAlg.amxpy(elliptic.Ndofs, 1.0, o_invDiagA, o_r, 0.0, o_X); @@ -128,7 +151,7 @@ void MGLevel::smoothJacobi(occa::memory &o_r, occa::memory &o_X, bool xIsZero) { linAlg.amxpy(elliptic.Ndofs, 1.0, o_invDiagA, o_RES, 1.0, o_X); } -void MGLevel::smoothChebyshev (occa::memory &o_r, occa::memory &o_X, bool xIsZero) { +void MGLevel::smoothChebyshev (deviceMemory& o_r, deviceMemory& o_X, bool xIsZero) { const dfloat theta = 0.5*(lambda1+lambda0); 
const dfloat delta = 0.5*(lambda1-lambda0); @@ -137,9 +160,11 @@ void MGLevel::smoothChebyshev (occa::memory &o_r, occa::memory &o_X, bool xIsZer dfloat rho_n = 1./sigma; dfloat rho_np1; - occa::memory &o_RES = o_smootherResidual; - occa::memory &o_Ad = o_smootherResidual2; - occa::memory &o_d = o_smootherUpdate; + deviceMemory& o_RES = o_smootherResidual; + deviceMemory& o_Ad = o_smootherResidual2; + deviceMemory& o_d = o_smootherUpdate; + + linAlg_t& linAlg = platform.linAlg(); if(xIsZero){ //skip the Ax if x is zero //res = S*r @@ -187,56 +212,56 @@ void MGLevel::smoothChebyshev (occa::memory &o_r, occa::memory &o_X, bool xIsZer * *******************************************/ -size_t MGLevel::smootherResidualBytes=0; -size_t MGLevel::scratchBytes=0; -dfloat* MGLevel::smootherResidual=nullptr; -occa::memory MGLevel::o_smootherResidual; -occa::memory MGLevel::o_smootherResidual2; -occa::memory MGLevel::o_smootherUpdate; -occa::memory MGLevel::o_transferScratch; +dlong MGLevel::NsmootherResidual=0; +dlong MGLevel::Nscratch=0; +memory MGLevel::smootherResidual; +deviceMemory MGLevel::o_smootherResidual; +deviceMemory MGLevel::o_smootherResidual2; +deviceMemory MGLevel::o_smootherUpdate; +deviceMemory MGLevel::o_transferScratch; //build a level and connect it to the next one MGLevel::MGLevel(elliptic_t& _elliptic, dlong _Nrows, dlong _Ncols, int Nc, int NpCoarse): multigridLevel(_Nrows, _Ncols, - _elliptic.platform, _elliptic.settings), + _elliptic.platform, + _elliptic.settings, + _elliptic.comm), elliptic(_elliptic), - mesh(_elliptic.mesh), - linAlg(_elliptic.linAlg) { + mesh(_elliptic.mesh) { SetupSmoother(); AllocateStorage(); - if (mesh.elementType==QUADRILATERALS || mesh.elementType==HEXAHEDRA) { - P = (dfloat *) calloc((mesh.N+1)*(Nc+1),sizeof(dfloat)); + if ( mesh.elementType==Mesh::QUADRILATERALS + || mesh.elementType==Mesh::HEXAHEDRA) { mesh.DegreeRaiseMatrix1D(Nc, mesh.N, P); - o_P = elliptic.platform.malloc((mesh.N+1)*(Nc+1)*sizeof(dfloat), P); - } else if 
(mesh.elementType==TRIANGLES) { - P = (dfloat *) calloc(mesh.Np*NpCoarse,sizeof(dfloat)); + } else if (mesh.elementType==Mesh::TRIANGLES) { mesh.DegreeRaiseMatrixTri2D(Nc, mesh.N, P); - o_P = elliptic.platform.malloc(mesh.Np*NpCoarse*sizeof(dfloat), P); - } else { - P = (dfloat *) calloc(mesh.Np*NpCoarse,sizeof(dfloat)); + } else { //Mesh::TETRAHEDRA mesh.DegreeRaiseMatrixTet3D(Nc, mesh.N, P); - o_P = elliptic.platform.malloc(mesh.Np*NpCoarse*sizeof(dfloat), P); } + o_P = elliptic.platform.malloc(P); //build kernels - occa::properties kernelInfo = elliptic.platform.props; + properties_t kernelInfo = elliptic.platform.props(); // set kernel name suffix - char *suffix; - if(mesh.elementType==TRIANGLES) - suffix = strdup("Tri2D"); - else if(mesh.elementType==QUADRILATERALS) - suffix = strdup("Quad2D"); - else if(mesh.elementType==TETRAHEDRA) - suffix = strdup("Tet3D"); - else if(mesh.elementType==HEXAHEDRA) - suffix = strdup("Hex3D"); - - char fileName[BUFSIZ], kernelName[BUFSIZ]; + std::string suffix; + if(mesh.elementType==Mesh::TRIANGLES) + suffix = "Tri2D"; + else if(mesh.elementType==Mesh::QUADRILATERALS) + suffix = "Quad2D"; + else if(mesh.elementType==Mesh::TETRAHEDRA) + suffix = "Tet3D"; + else if(mesh.elementType==Mesh::HEXAHEDRA) + suffix = "Hex3D"; + + std::string oklFilePrefix = DELLIPTIC "/okl/"; + std::string oklFileSuffix = ".okl"; + + std::string fileName, kernelName; kernelInfo["defines/" "p_NqFine"]= mesh.N+1; kernelInfo["defines/" "p_NqCoarse"]= Nc+1; @@ -247,74 +272,60 @@ MGLevel::MGLevel(elliptic_t& _elliptic, int blockMax = 256; if (elliptic.platform.device.mode() == "CUDA") blockMax = 512; - int NblockVFine = mymax(1,blockMax/mesh.Np); - int NblockVCoarse = mymax(1,blockMax/NpCoarse); + int NblockVFine = std::max(1,blockMax/mesh.Np); + int NblockVCoarse = std::max(1,blockMax/NpCoarse); kernelInfo["defines/" "p_NblockVFine"]= NblockVFine; kernelInfo["defines/" "p_NblockVCoarse"]= NblockVCoarse; if (settings.compareSetting("DISCRETIZATION", 
"CONTINUOUS")) { - sprintf(fileName, DELLIPTIC "/okl/ellipticPreconCoarsen%s.okl", suffix); - sprintf(kernelName, "ellipticPartialPreconCoarsen%s", suffix); + fileName = oklFilePrefix + "ellipticPreconCoarsen" + suffix + oklFileSuffix; + kernelName = "ellipticPartialPreconCoarsen" + suffix; partialCoarsenKernel = elliptic.platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(fileName, DELLIPTIC "/okl/ellipticPreconProlongate%s.okl", suffix); - sprintf(kernelName, "ellipticPartialPreconProlongate%s", suffix); + fileName = oklFilePrefix + "ellipticPreconProlongate" + suffix + oklFileSuffix; + kernelName = "ellipticPartialPreconProlongate" + suffix; partialProlongateKernel = elliptic.platform.buildKernel(fileName, kernelName, kernelInfo); } else { //IPDG - sprintf(fileName, DELLIPTIC "/okl/ellipticPreconCoarsen%s.okl", suffix); - sprintf(kernelName, "ellipticPreconCoarsen%s", suffix); + fileName = oklFilePrefix + "ellipticPreconCoarsen" + suffix + oklFileSuffix; + kernelName = "ellipticPreconCoarsen" + suffix; coarsenKernel = elliptic.platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(fileName, DELLIPTIC "/okl/ellipticPreconProlongate%s.okl", suffix); - sprintf(kernelName, "ellipticPreconProlongate%s", suffix); + fileName = oklFilePrefix + "ellipticPreconProlongate" + suffix + oklFileSuffix; + kernelName = "ellipticPreconProlongate" + suffix; prolongateKernel = elliptic.platform.buildKernel(fileName, kernelName, kernelInfo); } } void MGLevel::AllocateStorage() { // extra storage for smoothing op - size_t Nbytes = Ncols*sizeof(dfloat); - if (smootherResidualBytes < Nbytes) { - if (o_smootherResidual.size()) { - free(smootherResidual); - o_smootherResidual.free(); - o_smootherResidual2.free(); - o_smootherUpdate.free(); - } - - smootherResidual = (dfloat *) calloc(Ncols,sizeof(dfloat)); - o_smootherResidual = elliptic.platform.malloc(Nbytes,smootherResidual); - o_smootherResidual2 = elliptic.platform.malloc(Nbytes,smootherResidual); - 
o_smootherUpdate = elliptic.platform.malloc(Nbytes,smootherResidual); - smootherResidualBytes = Nbytes; + if (NsmootherResidual < Ncols) { + smootherResidual.malloc(Ncols, 0); + o_smootherResidual = elliptic.platform.malloc(smootherResidual); + o_smootherResidual2 = elliptic.platform.malloc(smootherResidual); + o_smootherUpdate = elliptic.platform.malloc(smootherResidual); + NsmootherResidual = Ncols; } - Nbytes = mesh.Nelements*mesh.Np*sizeof(dfloat); - if (scratchBytes < Nbytes) { - if (o_transferScratch.size()) { - o_transferScratch.free(); - } - dfloat *dummy = (dfloat *) calloc(mesh.Nelements*mesh.Np,sizeof(dfloat)); - o_transferScratch = elliptic.platform.malloc(Nbytes, dummy); - free(dummy); - scratchBytes = Nbytes; + if (Nscratch < mesh.Nelements*mesh.Np) { + memory dummy(mesh.Nelements*mesh.Np,0); + o_transferScratch = elliptic.platform.malloc(dummy); + Nscratch = mesh.Nelements*mesh.Np; } } void MGLevel::Report() { - hlong hNrows = (hlong) Nrows; - - dlong minNrows=0, maxNrows=0; - hlong totalNrows=0; - dfloat avgNrows; + int totalActive=(Nrows>0) ? 
1:0; + mesh.comm.Allreduce(totalActive); - MPI_Allreduce(&Nrows, &maxNrows, 1, MPI_DLONG, MPI_MAX, mesh.comm); - MPI_Allreduce(&hNrows, &totalNrows, 1, MPI_HLONG, MPI_SUM, mesh.comm); - avgNrows = (dfloat) totalNrows/mesh.size; + dlong minNrows=Nrows, maxNrows=Nrows; + hlong totalNrows=Nrows; + mesh.comm.Allreduce(maxNrows, Comm::Max); + mesh.comm.Allreduce(totalNrows, Comm::Sum); + dfloat avgNrows = static_cast(totalNrows)/totalActive; if (Nrows==0) Nrows=maxNrows; //set this so it's ignored for the global min - MPI_Allreduce(&Nrows, &minNrows, 1, MPI_DLONG, MPI_MIN, mesh.comm); + mesh.comm.Allreduce(minNrows, Comm::Min); char smootherString[BUFSIZ]; if (stype==JACOBI) @@ -331,24 +342,18 @@ void MGLevel::Report() { } } -MGLevel::~MGLevel() { - coarsenKernel.free(); - partialCoarsenKernel.free(); - prolongateKernel.free(); - partialProlongateKernel.free(); -} - void MGLevel::SetupSmoother() { //set up the fine problem smoothing - dfloat *diagA = (dfloat*) calloc(Nrows, sizeof(dfloat)); - dfloat *invDiagA = (dfloat*) calloc(Nrows, sizeof(dfloat)); + memory diagA (Nrows); + memory invDiagA(Nrows); elliptic.BuildOperatorDiagonal(diagA); - for (dlong n=0;n(invDiagA); if (elliptic.settings.compareSetting("MULTIGRID SMOOTHER","CHEBYSHEV")) { stype = CHEBYSHEV; @@ -376,8 +381,6 @@ void MGLevel::SetupSmoother() { //update diagonal with weight o_invDiagA.copyFrom(invDiagA); } - free(diagA); - free(invDiagA); } @@ -392,28 +395,28 @@ dfloat MGLevel::maxEigSmoothAx(){ const dlong N = Nrows; const dlong M = Ncols; + linAlg_t& linAlg = platform.linAlg(); + int k = 10; - hlong Nlocal = (hlong) Nrows; - hlong Ntotal = 0; - MPI_Allreduce(&Nlocal, &Ntotal, 1, MPI_HLONG, MPI_SUM, mesh.comm); + hlong Ntotal = Nrows; + mesh.comm.Allreduce(Ntotal); if(k > Ntotal) k = static_cast(Ntotal); // do an arnoldi // allocate memory for Hessenberg matrix - double *H = (double *) calloc(k*k,sizeof(double)); + memory H(k*k,0.0); // allocate memory for basis - dfloat *Vx = (dfloat*) calloc(M, 
sizeof(dfloat)); - // occa::memory *o_V = (occa::memory *) calloc(k+1, sizeof(occa::memory)); - occa::memory *o_V = new occa::memory[k+1]; + memory Vx(M); + memory> o_V(k+1); - occa::memory o_Vx = elliptic.platform.malloc(M*sizeof(dfloat),Vx); - occa::memory o_AVx = elliptic.platform.malloc(M*sizeof(dfloat),Vx); + deviceMemory o_Vx = elliptic.platform.malloc(Vx); + deviceMemory o_AVx = elliptic.platform.malloc(Vx); for(int i=0; i<=k; i++) - o_V[i] = elliptic.platform.malloc(M*sizeof(dfloat),Vx); + o_V[i] = elliptic.platform.malloc(Vx); // generate a random vector for initial basis vector for (dlong i=0;i(hij); } if(j+1 < k){ @@ -445,14 +448,14 @@ dfloat MGLevel::maxEigSmoothAx(){ dfloat norm_vj = linAlg.norm2(N, o_V[j+1], mesh.comm); linAlg.scale(N, 1.0/norm_vj, o_V[j+1]); - H[j+1+ j*k] = (double) norm_vj; + H[j+1+ j*k] = static_cast(norm_vj); } } - double *WR = (double *) malloc(k*sizeof(double)); - double *WI = (double *) malloc(k*sizeof(double)); + memory WR(k); + memory WI(k); - matrixEigenValues(k, H, WR, WI); + linAlg_t::matrixEigenValues(k, H, WR, WI); double rho = 0.; @@ -464,17 +467,6 @@ dfloat MGLevel::maxEigSmoothAx(){ } } - // free memory - free(H); - free(WR); - free(WI); - - free(Vx); - o_Vx.free(); - o_AVx.free(); - for(int i=0; i<=k; i++) o_V[i].free(); - delete[] o_V; - // if((mesh.rank==0)) printf("weight = %g \n", rho); return rho; diff --git a/solvers/elliptic/src/ellipticPreconOAS.cpp b/solvers/elliptic/src/ellipticPreconOAS.cpp index 8854105c2..52a161950 100644 --- a/solvers/elliptic/src/ellipticPreconOAS.cpp +++ b/solvers/elliptic/src/ellipticPreconOAS.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -30,7 +30,7 @@ SOFTWARE. 
// entire local mesh + 1 ring overlap, solved with a local multigrid // precon and coarse problem consisting of the global degree 1 // problem, solved with parAlmond -void OASPrecon::Operator(occa::memory& o_r, occa::memory& o_Mr) { +void OASPrecon::Operator(deviceMemory& o_r, deviceMemory& o_Mr) { if (mesh.N>1) { if (elliptic.disc_c0) { @@ -40,45 +40,47 @@ void OASPrecon::Operator(occa::memory& o_r, occa::memory& o_Mr) { // to a single operation, but currently theres no easy way // as the ordering of globalDofs between the original mesh // partition and the ring mesh could be different - elliptic.ogsMasked->Scatter(o_rPatchL, o_r, ogs_dfloat, ogs_add, ogs_notrans); - mesh.ringHalo->Exchange(o_rPatchL, mesh.Np, ogs_dfloat); - ellipticPatch->ogsMasked->Gather(o_rPatch, o_rPatchL, ogs_dfloat, ogs_add, ogs_notrans); + elliptic.ogsMasked.Scatter(o_rPatchL, o_r, 1, ogs::NoTrans); + mesh.ringHalo.Exchange(o_rPatchL, mesh.Np); + ellipticPatch.ogsMasked.Gather(o_rPatch, o_rPatchL, 1, ogs::Add, ogs::NoTrans); } else { - o_rPatch.copyFrom(o_r, elliptic.Ndofs*sizeof(dfloat)); - mesh.ringHalo->Exchange(o_rPatch, mesh.Np, ogs_dfloat); + o_rPatch.copyFrom(o_r, elliptic.Ndofs); + mesh.ringHalo.Exchange(o_rPatch, mesh.Np); } //Apply local patch precon - preconPatch->Operator(o_rPatch, o_zPatch); + preconPatch.Operator(o_rPatch, o_zPatch); //Coarsen problem to N=1 and pass to parAlmond // TODO: This is blocking due to H<->D transfers. 
// Should modify precons so size=1 is non-blocking - level->coarsen(o_r, o_rC); + level.coarsen(o_r, o_rC); parAlmond.Operator(o_rC, o_zC); + linAlg_t& linAlg = elliptic.platform.linAlg(); + //Add contributions from all patches together if (elliptic.disc_c0) { dlong Ntotal=mesh.Nelements*mesh.Np; - ellipticPatch->ogsMasked->Scatter(o_zPatchL, o_zPatch, ogs_dfloat, ogs_add, ogs_notrans); - ogsMaskedRing->GatherScatter(o_zPatchL, ogs_dfloat, ogs_add, ogs_sym); + ellipticPatch.ogsMasked.Scatter(o_zPatchL, o_zPatch, 1, ogs::NoTrans); + ogsMaskedRing.GatherScatter(o_zPatchL, 1, ogs::Add, ogs::Sym); // Weight by overlap degree, zPatch = patchWeight*zPatch - elliptic.linAlg.amx(Ntotal, 1.0, o_patchWeight, o_zPatchL); + linAlg.amx(Ntotal, 1.0, o_patchWeight, o_zPatchL); - elliptic.ogsMasked->Gather(o_Mr, o_zPatchL, ogs_dfloat, ogs_add, ogs_notrans); + elliptic.ogsMasked.Gather(o_Mr, o_zPatchL, 1, ogs::Add, ogs::NoTrans); } else { - mesh.ringHalo->Combine(o_zPatch, mesh.Np, ogs_dfloat); + mesh.ringHalo.Combine(o_zPatch, mesh.Np); // Weight by overlap degree, Mr = patchWeight*zPatch - elliptic.linAlg.amxpy(elliptic.Ndofs, 1.0, o_patchWeight, o_zPatch, 0.0, o_Mr); + linAlg.amxpy(elliptic.Ndofs, 1.0, o_patchWeight, o_zPatch, 0.0, o_Mr); } // Add prologatated coarse solution - level->prolongate(o_zC, o_Mr); + level.prolongate(o_zC, o_Mr); } else { //if N=1 just call the coarse solver parAlmond.Operator(o_r, o_Mr); @@ -94,78 +96,86 @@ OASPrecon::OASPrecon(elliptic_t& _elliptic): //build the one ring mesh if (mesh.N>1) { + if (Comm::World().rank()==0){ + printf("-----------------------------Multigrid Degree %2d Patch--------------------------------------\n", mesh.N); + } meshPatch = mesh.SetupRingPatch(); - ellipticPatch = elliptic.SetupRingPatch(*meshPatch); - preconPatch = new MultiGridPrecon(*ellipticPatch); + ellipticPatch = elliptic.SetupRingPatch(meshPatch); + preconPatch.Setup(ellipticPatch); if (settings.compareSetting("DISCRETIZATION", "CONTINUOUS")) { - rPatchL = 
(dfloat*) calloc(mesh.Np*(mesh.Nelements+mesh.totalRingElements),sizeof(dfloat)); - zPatchL = (dfloat*) calloc(mesh.Np*(mesh.Nelements+mesh.totalRingElements),sizeof(dfloat)); + rPatchL.malloc(mesh.Np*(mesh.Nelements+mesh.totalRingElements),0.0); + zPatchL.malloc(mesh.Np*(mesh.Nelements+mesh.totalRingElements),0.0); - o_rPatchL = elliptic.platform.malloc(mesh.Np*(mesh.Nelements+mesh.totalRingElements)*sizeof(dfloat), rPatchL); - o_zPatchL = elliptic.platform.malloc(mesh.Np*(mesh.Nelements+mesh.totalRingElements)*sizeof(dfloat), zPatchL); + o_rPatchL = elliptic.platform.malloc(rPatchL); + o_zPatchL = elliptic.platform.malloc(zPatchL); } - rPatch = (dfloat*) calloc(ellipticPatch->Ndofs,sizeof(dfloat)); - zPatch = (dfloat*) calloc(ellipticPatch->Ndofs,sizeof(dfloat)); - o_rPatch = elliptic.platform.malloc(ellipticPatch->Ndofs*sizeof(dfloat), rPatch); - o_zPatch = elliptic.platform.malloc(ellipticPatch->Ndofs*sizeof(dfloat), zPatch); + rPatch.malloc(ellipticPatch.Ndofs,0.0); + zPatch.malloc(ellipticPatch.Ndofs,0.0); + o_rPatch = elliptic.platform.malloc(rPatch); + o_zPatch = elliptic.platform.malloc(zPatch); //compute patch overlap weighting - patchWeight = (dfloat*) malloc(meshPatch->Nelements*meshPatch->Np*sizeof(dfloat)); - for (int i=0;iNelements*meshPatch->Np;i++) + patchWeight.malloc(meshPatch.Nelements*meshPatch.Np); + for (int i=0;iNelements*meshPatch->Np,sizeof(hlong)); - memcpy(maskedRingGlobalIds, elliptic.maskedGlobalIds, mesh.Nelements*mesh.Np*sizeof(hlong)); - mesh.ringHalo->Exchange(maskedRingGlobalIds, mesh.Np, ogs_hlong); + memory maskedRingGlobalIds(meshPatch.Nelements*meshPatch.Np); + maskedRingGlobalIds.copyFrom(elliptic.maskedGlobalIds, mesh.Nelements*mesh.Np); + mesh.ringHalo.Exchange(maskedRingGlobalIds, mesh.Np); //mask ring - for (dlong n=0;nNmasked;n++) - maskedRingGlobalIds[ellipticPatch->maskIds[n]] = 0; + for (dlong n=0;nNelements*meshPatch->Np, maskedRingGlobalIds, - mesh.comm, verbose, elliptic.platform); - free(maskedRingGlobalIds); + 
bool unique = true; //flag a unique node in every gather node + ogsMaskedRing.Setup(meshPatch.Nelements*meshPatch.Np, + maskedRingGlobalIds, mesh.comm, + ogs::Signed, ogs::Auto, + unique, verbose, elliptic.platform); //determine overlap of each node with masked ogs - ogsMaskedRing->GatherScatter(patchWeight, ogs_dfloat, ogs_add, ogs_sym); + ogsMaskedRing.GatherScatter(patchWeight, 1, ogs::Add, ogs::Sym); } else { //determine overlap by combining halos - mesh.ringHalo->Combine(patchWeight, mesh.Np, ogs_dfloat); + mesh.ringHalo.Combine(patchWeight, mesh.Np); } //invert - for (int i=0;iNelements*meshPatch->Np;i++) + for (int i=0;i 0.0) ? 1.0/patchWeight[i] : 0.0; - o_patchWeight = elliptic.platform.malloc(meshPatch->Nelements*meshPatch->Np*sizeof(dfloat), patchWeight); + o_patchWeight = elliptic.platform.malloc(patchWeight); } //build the coarse precon int Nc = 1; //hard code int NpCoarse = mesh.Np; switch(mesh.elementType){ - case TRIANGLES: + case Mesh::TRIANGLES: NpCoarse = ((Nc+1)*(Nc+2))/2; break; - case QUADRILATERALS: + case Mesh::QUADRILATERALS: NpCoarse = (Nc+1)*(Nc+1); break; - case TETRAHEDRA: + case Mesh::TETRAHEDRA: NpCoarse = ((Nc+1)*(Nc+2)*(Nc+3))/6; break; - case HEXAHEDRA: + case Mesh::HEXAHEDRA: NpCoarse = (Nc+1)*(Nc+1)*(Nc+1); break; } //build mesh and elliptic objects for this degree - mesh_t &meshC = mesh.SetupNewDegree(Nc); - elliptic_t &ellipticC = elliptic.SetupNewDegree(meshC); + mesh_t meshC = mesh.SetupNewDegree(Nc); + elliptic_t ellipticC = elliptic.SetupNewDegree(meshC); //build full A matrix and pass to parAlmond + if (Comm::World().rank()==0){ + printf("-----------------------------Multigrid AMG Setup--------------------------------------------\n"); + } parAlmond::parCOO A(elliptic.platform, meshC.comm); if (settings.compareSetting("DISCRETIZATION", "IPDG")) ellipticC.BuildOperatorMatrixIpdg(A); @@ -177,8 +187,10 @@ OASPrecon::OASPrecon(elliptic_t& _elliptic): int size = meshC.size; hlong TotalRows = A.globalRowStarts[size]; dlong 
numLocalRows = (dlong) (A.globalRowStarts[rank+1]-A.globalRowStarts[rank]); - dfloat *null = (dfloat *) malloc(numLocalRows*sizeof(dfloat)); - for (dlong i=0;i null(numLocalRows); + for (dlong i=0;iNgather; - Ncols = Nrows + elliptic.ogsMasked->NgatherHalo; + Nrows = elliptic.ogsMasked.Ngather; + Ncols = Nrows + elliptic.gHalo.Nhalo; } else { Nrows = mesh.Nelements*mesh.Np; Ncols = Nrows + mesh.totalHaloPairs*mesh.Np; } - level = new MGLevel(elliptic, Nrows, Ncols, Nc, NpCoarse); - level->meshC = &meshC; - level->ogsMaskedC = ellipticC.ogsMasked; + level = MGLevel(elliptic, Nrows, Ncols, Nc, NpCoarse); + level.meshC = meshC; + level.ellipticC = ellipticC; //coarse buffers Ncols = parAlmond.getNumCols(0); - rC = (dfloat*) calloc(Ncols,sizeof(dfloat)); - zC = (dfloat*) calloc(Ncols,sizeof(dfloat)); - o_rC = elliptic.platform.malloc(Ncols*sizeof(dfloat), rC); - o_zC = elliptic.platform.malloc(Ncols*sizeof(dfloat), zC); + rC.malloc(Ncols,0.0); + zC.malloc(Ncols,0.0); + o_rC = elliptic.platform.malloc(rC); + o_zC = elliptic.platform.malloc(zC); } //report parAlmond.Report(); } - -OASPrecon::~OASPrecon() { - if (mesh.N>1) { - delete preconPatch; - if (mesh.size>1) delete ellipticPatch; - if (mesh.size>1) delete meshPatch; - - delete &(level->elliptic); - if (level->mesh.ogs) level->mesh.ogs->Free(); - delete level; - } -} \ No newline at end of file diff --git a/solvers/elliptic/src/ellipticPreconParAlmond.cpp b/solvers/elliptic/src/ellipticPreconParAlmond.cpp index 298e9af70..5b9bf3768 100644 --- a/solvers/elliptic/src/ellipticPreconParAlmond.cpp +++ b/solvers/elliptic/src/ellipticPreconParAlmond.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,7 +27,7 @@ SOFTWARE. 
#include "ellipticPrecon.hpp" //AMG preconditioner via parAlmond -void ParAlmondPrecon::Operator(occa::memory& o_r, occa::memory& o_Mr) { +void ParAlmondPrecon::Operator(deviceMemory& o_r, deviceMemory& o_Mr) { //hand off to parAlmond parAlmond.Operator(o_r, o_Mr); @@ -41,6 +41,9 @@ ParAlmondPrecon::ParAlmondPrecon(elliptic_t& _elliptic): parAlmond(elliptic.platform, settings, elliptic.mesh.comm) { //build full A matrix and pass to parAlmond + if (Comm::World().rank()==0){ + printf("-----------------------------Multigrid AMG Setup--------------------------------------------\n"); + } parAlmond::parCOO A(elliptic.platform, elliptic.mesh.comm); if (settings.compareSetting("DISCRETIZATION", "IPDG")) { elliptic.BuildOperatorMatrixIpdg(A); @@ -52,12 +55,13 @@ ParAlmondPrecon::ParAlmondPrecon(elliptic_t& _elliptic): int rank = elliptic.mesh.rank; int size = elliptic.mesh.size; hlong TotalRows = A.globalRowStarts[size]; - dlong numLocalRows = (dlong) (A.globalRowStarts[rank+1]-A.globalRowStarts[rank]); - dfloat *null = (dfloat *) malloc(numLocalRows*sizeof(dfloat)); - for (dlong i=0;i(A.globalRowStarts[rank+1]-A.globalRowStarts[rank]); + memory null(numLocalRows); + for (dlong i=0;i& o_r, deviceMemory& o_Mr) { - if (mesh.elementType==TRIANGLES) { + linAlg_t& linAlg = elliptic.platform.linAlg(); + + if (mesh.elementType==Mesh::TRIANGLES) { // Mr = invDegree.*r - elliptic.linAlg.amxpy(elliptic.Ndofs, 1.0, elliptic.o_weightG, o_r, 0.0, o_Mr); + linAlg.amxpy(elliptic.Ndofs, 1.0, elliptic.o_weightG, o_r, 0.0, o_Mr); - elliptic.ogsMasked->Scatter(o_MrL, o_Mr, ogs_dfloat, ogs_add, ogs_notrans); + elliptic.ogsMasked.Scatter(o_MrL, o_Mr, 1, ogs::NoTrans); SEMFEMInterpKernel(mesh.Nelements, mesh.o_SEMFEMAnterp, o_MrL, o_rFEM); - FEMogs->Gather(o_GrFEM, o_rFEM, ogs_dfloat, ogs_add, ogs_trans); + FEMogs.Gather(o_GrFEM, o_rFEM, 1, ogs::Add, ogs::Trans); parAlmond.Operator(o_GrFEM, o_GzFEM); - FEMogs->Scatter(o_zFEM, o_GzFEM, ogs_dfloat, ogs_add, ogs_notrans); + FEMogs.Scatter(o_zFEM, 
o_GzFEM, 1, ogs::NoTrans); SEMFEMAnterpKernel(mesh.Nelements, mesh.o_SEMFEMAnterp, o_zFEM, o_MrL); - elliptic.ogsMasked->Gather(o_Mr, o_MrL, ogs_dfloat, ogs_add, ogs_trans); + elliptic.ogsMasked.Gather(o_Mr, o_MrL, 1, ogs::Add, ogs::Trans); // Mr = invDegree.*Mr - elliptic.linAlg.amx(elliptic.Ndofs, 1.0, elliptic.o_weightG, o_Mr); + linAlg.amx(elliptic.Ndofs, 1.0, elliptic.o_weightG, o_Mr); } else { //pass to parAlmond @@ -61,49 +63,30 @@ SEMFEMPrecon::SEMFEMPrecon(elliptic_t& _elliptic): parAlmond(elliptic.platform, settings, mesh.comm) { //sanity checking - if (!settings.compareSetting("DISCRETIZATION", "CONTINUOUS") ) - LIBP_ABORT(string("SEMFEM is supported for CONTINUOUS only")); + LIBP_ABORT("SEMFEM is supported for CONTINUOUS only", + !settings.compareSetting("DISCRETIZATION", "CONTINUOUS")); //make a low-order fem mesh from the sem mesh (also return globalIds of the enriched sem nodes, and faceNode mapping) - int Nfp = 0; - int *faceNodes = NULL; - hlong *globalIds = NULL; - femMesh = mesh.SetupSEMFEM(&globalIds, &Nfp, &faceNodes); + memory globalIds; + memory mapB; + femMesh = mesh.SetupSEMFEM(globalIds, mapB); //use the BCs to make a maskedGlobalIds array dlong Ntotal = mesh.NpFEM*mesh.Nelements; - hlong* maskedGlobalIds = (hlong *) calloc(Ntotal,sizeof(hlong)); - memcpy(maskedGlobalIds, globalIds, Ntotal*sizeof(hlong)); - if (mesh.elementType==TRIANGLES) { //build a new mask for NpFEM>Np node sets - // gather-scatter - int verbose = 0; - ogs_t *ogs = ogs_t::Setup(Ntotal, globalIds, mesh.comm, verbose, elliptic.platform); - - //make a node-wise bc flag using the gsop (prioritize Dirichlet boundaries over Neumann) - const int largeNumber = 1<<20; - int *mapB = (int *) calloc(Ntotal,sizeof(int)); - for (dlong e=0;e0) { - for (int n=0;n maskedGlobalIds(Ntotal); + maskedGlobalIds.copyFrom(globalIds, Ntotal); + + if (mesh.elementType==Mesh::TRIANGLES) { //build a new mask for NpFEM>Np node sets + //translate the node-wise bc flag + for (int n=0;n0) { + int BC 
= elliptic.BCType[bc]; //translate mesh's boundary flag + mapB[n] = BC; //record it + + if (mapB[n] == 1) maskedGlobalIds[n] = 0; //Dirichlet boundary } } - ogs->GatherScatter(mapB, ogs_int, ogs_min, ogs_sym); - - //use the bc flags to find masked ids - for (dlong n=0;nFree(); } else { //mask using the original mask for (dlong n=0;nNelements*femMesh->Nverts,sizeof(dlong)); + memory localIds(femMesh.Nelements*femMesh.Nverts); for(dlong e=0;eNverts]; + dlong id[femMesh.Nverts]; //local ids in the subelement fem grid for (int i=0;iNverts+i]; + id[i] = e*mesh.NpFEM + mesh.FEMEToV[n*femMesh.Nverts+i]; - dlong femId = e*mesh.NelFEM*femMesh->Nverts+n*mesh.Nverts; + dlong femId = e*mesh.NelFEM*femMesh.Nverts+n*mesh.Nverts; switch(mesh.elementType){ - case TRIANGLES: + case Mesh::TRIANGLES: localIds[femId+0] = id[0]; localIds[femId+1] = id[1]; localIds[femId+2] = id[2]; break; - case QUADRILATERALS: + case Mesh::QUADRILATERALS: localIds[femId+0] = id[0]; localIds[femId+1] = id[1]; localIds[femId+2] = id[3]; //need to swap this as the Np nodes are ordered [0,1,3,2] in a degree 1 element localIds[femId+3] = id[2]; break; - case TETRAHEDRA: + case Mesh::TETRAHEDRA: localIds[femId+0] = id[0]; localIds[femId+1] = id[1]; localIds[femId+2] = id[2]; localIds[femId+3] = id[3]; break; - case HEXAHEDRA: + case Mesh::HEXAHEDRA: localIds[femId+0] = id[0]; localIds[femId+1] = id[1]; localIds[femId+2] = id[3]; //need to swap this as the Np nodes are ordered [0,1,3,2,4,5,7,6] in a degree 1 element @@ -159,99 +144,98 @@ SEMFEMPrecon::SEMFEMPrecon(elliptic_t& _elliptic): } //make a fem elliptic solver - femElliptic = new elliptic_t(elliptic.platform, *femMesh, - elliptic.settings, elliptic.lambda); - femElliptic->ogsMasked = FEMogs; //only for getting Ngather when building matrix + femElliptic.platform = elliptic.platform; + femElliptic.mesh = femMesh; + femElliptic.settings = elliptic.settings; + femElliptic.lambda = elliptic.lambda; + + femElliptic.ogsMasked = FEMogs; //only for getting 
Ngather when building matrix // number of degrees of freedom on this rank (after gathering) - hlong Ngather = FEMogs->Ngather; + hlong Ngather = FEMogs.Ngather; // create a global numbering system - hlong *globalIds2 = (hlong *) calloc(Ngather,sizeof(hlong)); + memory globalIds2(Ngather); // every gathered degree of freedom has its own global id - hlong *globalStarts = (hlong *) calloc(mesh.size+1,sizeof(hlong)); - MPI_Allgather(&Ngather, 1, MPI_HLONG, globalStarts+1, 1, MPI_HLONG, mesh.comm); - for(int r=0;rNgather;n++) { - globalIds2[n] = n + globalStarts[mesh.rank]; + for (dlong n =0;nScatter(maskedGlobalNumbering, globalIds2, ogs_hlong, ogs_add, ogs_notrans); - free(globalIds2); + memory maskedGlobalNumbering(Ntotal, -1); + FEMogs.Scatter(maskedGlobalNumbering, globalIds2, 1, ogs::NoTrans); //transfer the consecutive global numbering to the fem mesh - Ntotal = femMesh->Np*femMesh->Nelements; - femElliptic->maskedGlobalNumbering = (hlong *) calloc(Ntotal,sizeof(hlong)); + Ntotal = femMesh.Np*femMesh.Nelements; + femElliptic.maskedGlobalNumbering.malloc(Ntotal); - for (dlong e=0;eNelements;e++) { - for (int n=0;nNp;n++) { - dlong id = e*femMesh->Np + n; + for (dlong e=0;emaskedGlobalNumbering[id] = maskedGlobalNumbering[localId]; + femElliptic.maskedGlobalNumbering[id] = maskedGlobalNumbering[localId]; } } - free(localIds); free(maskedGlobalNumbering); //finally, build the fem matrix and pass to parAlmond - parAlmond::parCOO A(elliptic.platform, femMesh->comm); - femElliptic->BuildOperatorMatrixContinuous(A); + if (mesh.rank==0){ + printf("-----------------------------Multigrid AMG Setup--------------------------------------------\n"); + } + parAlmond::parCOO A(elliptic.platform, femMesh.comm); + femElliptic.BuildOperatorMatrixContinuous(A); //populate null space unit vector - int rank = femMesh->rank; - int size = femMesh->size; + int rank = femMesh.rank; + int size = femMesh.size; hlong TotalRows = A.globalRowStarts[size]; - dlong numLocalRows = (dlong) 
(A.globalRowStarts[rank+1]-A.globalRowStarts[rank]); - dfloat *null = (dfloat *) malloc(numLocalRows*sizeof(dfloat)); - for (dlong i=0;i(A.globalRowStarts[rank+1]-A.globalRowStarts[rank]); + + memory null(numLocalRows); + for (dlong i=0;i SEMFEMAnterp(mesh.NpFEM*mesh.Np); for(int n=0;n(mesh.SEMFEMInterp); + mesh.o_SEMFEMAnterp = elliptic.platform.malloc(SEMFEMAnterp); - dfloat *dummy = (dfloat*) calloc(mesh.Nelements*mesh.NpFEM,sizeof(dfloat)); //need this to avoid uninitialized memory warnings - o_rFEM = elliptic.platform.malloc(mesh.Nelements*mesh.NpFEM*sizeof(dfloat), dummy); - o_zFEM = elliptic.platform.malloc(mesh.Nelements*mesh.NpFEM*sizeof(dfloat), dummy); - free(dummy); + memory dummy(mesh.Nelements*mesh.NpFEM,0.0); //need this to avoid uninitialized memory warnings + o_rFEM = elliptic.platform.malloc(dummy); + o_zFEM = elliptic.platform.malloc(dummy); dlong Ncols = parAlmond.getNumCols(0); - dummy = (dfloat*) calloc(Ncols,sizeof(dfloat)); - o_GrFEM = elliptic.platform.malloc(Ncols*sizeof(dfloat),dummy); - o_GzFEM = elliptic.platform.malloc(Ncols*sizeof(dfloat),dummy); - free(dummy); + dummy.malloc(Ncols,0.0); + o_GrFEM = elliptic.platform.malloc(dummy); + o_GzFEM = elliptic.platform.malloc(dummy); - o_MrL = elliptic.platform.malloc(mesh.Np*mesh.Nelements*sizeof(dfloat)); + o_MrL = elliptic.platform.malloc(mesh.Np*mesh.Nelements); //build kernels - occa::properties kernelInfo = mesh.props; + properties_t kernelInfo = mesh.props; kernelInfo["defines/" "p_Np"]= mesh.Np; kernelInfo["defines/" "p_NpFEM"]= mesh.NpFEM; - int NblockV = 512/mesh.NpFEM; + int NblockV = std::max(256/mesh.NpFEM, 1); kernelInfo["defines/" "p_NblockV"]= NblockV; SEMFEMInterpKernel = elliptic.platform.buildKernel(DELLIPTIC "/okl/ellipticSEMFEMInterp.okl", @@ -261,12 +245,3 @@ SEMFEMPrecon::SEMFEMPrecon(elliptic_t& _elliptic): "ellipticSEMFEMAnterp", kernelInfo); } } - -SEMFEMPrecon::~SEMFEMPrecon() { - femElliptic->ogsMasked->Free(); - - femMesh->halo->Free(); - - 
SEMFEMInterpKernel.free(); - SEMFEMAnterpKernel.free(); -} \ No newline at end of file diff --git a/solvers/elliptic/src/ellipticRun.cpp b/solvers/elliptic/src/ellipticRun.cpp index bc48c79bc..ef9f6015a 100644 --- a/solvers/elliptic/src/ellipticRun.cpp +++ b/solvers/elliptic/src/ellipticRun.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,108 +25,119 @@ SOFTWARE. */ #include "elliptic.hpp" +#include "timer.hpp" void elliptic_t::Run(){ //setup linear solver hlong NglobalDofs; if (settings.compareSetting("DISCRETIZATION", "CONTINUOUS")) { - NglobalDofs = ogsMasked->NgatherGlobal*Nfields; + NglobalDofs = ogsMasked.NgatherGlobal*Nfields; } else { NglobalDofs = mesh.NelementsGlobal*mesh.Np*Nfields; } - linearSolver_t *linearSolver = linearSolver_t::Setup(Ndofs, Nhalo, - platform, settings, mesh.comm); - occa::properties kernelInfo = mesh.props; //copy base occa properties + linearSolver_t linearSolver; + if (settings.compareSetting("LINEAR SOLVER","NBPCG")){ + linearSolver.Setup(Ndofs, Nhalo, platform, settings, comm); + } else if (settings.compareSetting("LINEAR SOLVER","NBFPCG")){ + linearSolver.Setup(Ndofs, Nhalo, platform, settings, comm); + } else if (settings.compareSetting("LINEAR SOLVER","PCG")){ + linearSolver.Setup(Ndofs, Nhalo, platform, settings, comm); + } else if (settings.compareSetting("LINEAR SOLVER","PGMRES")){ + linearSolver.Setup(Ndofs, Nhalo, platform, settings, comm); + } else if (settings.compareSetting("LINEAR SOLVER","PMINRES")){ + linearSolver.Setup(Ndofs, Nhalo, platform, settings, comm); + } + + properties_t kernelInfo = mesh.props; //copy base occa properties - string dataFileName; + std::string dataFileName; settings.getSetting("DATA 
FILE", dataFileName); kernelInfo["includes"] += dataFileName; //add standard boundary functions - char *boundaryHeaderFileName; + std::string boundaryHeaderFileName; if (mesh.dim==2) - boundaryHeaderFileName = strdup(DELLIPTIC "/data/ellipticBoundary2D.h"); + boundaryHeaderFileName = std::string(DELLIPTIC "/data/ellipticBoundary2D.h"); else if (mesh.dim==3) - boundaryHeaderFileName = strdup(DELLIPTIC "/data/ellipticBoundary3D.h"); + boundaryHeaderFileName = std::string(DELLIPTIC "/data/ellipticBoundary3D.h"); kernelInfo["includes"] += boundaryHeaderFileName; - int Nmax = mymax(mesh.Np, mesh.Nfaces*mesh.Nfp); + int Nmax = std::max(mesh.Np, mesh.Nfaces*mesh.Nfp); kernelInfo["defines/" "p_Nmax"]= Nmax; kernelInfo["defines/" "p_Nfields"]= Nfields; // set kernel name suffix - char *suffix; - if(mesh.elementType==TRIANGLES) - suffix = strdup("Tri2D"); - if(mesh.elementType==QUADRILATERALS) - suffix = strdup("Quad2D"); - - if(mesh.elementType==TETRAHEDRA) - suffix = strdup("Tet3D"); - if(mesh.elementType==HEXAHEDRA) - suffix = strdup("Hex3D"); - - char fileName[BUFSIZ], kernelName[BUFSIZ]; - - if(mesh.elementType==QUADRILATERALS){ + std::string suffix; + if(mesh.elementType==Mesh::TRIANGLES) { + suffix = "Tri2D"; + } else if(mesh.elementType==Mesh::QUADRILATERALS) { if(mesh.dim==2) - suffix = strdup("Quad2D"); + suffix = "Quad2D"; else - suffix = strdup("Quad3D"); + suffix = "Quad3D"; + } else if(mesh.elementType==Mesh::TETRAHEDRA) { + suffix = "Tet3D"; + } else { //mesh.elementType==Mesh::HEXAHEDRA) + suffix = "Hex3D"; } - sprintf(fileName, DELLIPTIC "/okl/ellipticRhs%s.okl", suffix); - sprintf(kernelName, "ellipticRhs%s", suffix); - occa::kernel forcingKernel = platform.buildKernel(fileName, kernelName, + std::string oklFilePrefix = DELLIPTIC "/okl/"; + std::string oklFileSuffix = ".okl"; + + std::string fileName, kernelName; + + fileName = oklFilePrefix + "ellipticRhs" + suffix + oklFileSuffix; + kernelName = "ellipticRhs" + suffix; + kernel_t forcingKernel = 
platform.buildKernel(fileName, kernelName, kernelInfo); - occa::kernel rhsBCKernel, addBCKernel; + kernel_t rhsBCKernel, addBCKernel; if (settings.compareSetting("DISCRETIZATION","IPDG")) { - sprintf(fileName, DELLIPTIC "/okl/ellipticRhsBCIpdg%s.okl", suffix); - sprintf(kernelName, "ellipticRhsBCIpdg%s", suffix); + fileName = oklFilePrefix + "ellipticRhsBCIpdg" + suffix + oklFileSuffix; + kernelName = "ellipticRhsBCIpdg" + suffix; rhsBCKernel = platform.buildKernel(fileName,kernelName, kernelInfo); } else if (settings.compareSetting("DISCRETIZATION","CONTINUOUS")) { - sprintf(fileName, DELLIPTIC "/okl/ellipticRhsBC%s.okl", suffix); - sprintf(kernelName, "ellipticRhsBC%s", suffix); + fileName = oklFilePrefix + "ellipticRhsBC" + suffix + oklFileSuffix; + kernelName = "ellipticRhsBC" + suffix; rhsBCKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(fileName, DELLIPTIC "/okl/ellipticAddBC%s.okl", suffix); - sprintf(kernelName, "ellipticAddBC%s", suffix); + fileName = oklFilePrefix + "ellipticAddBC" + suffix + oklFileSuffix; + kernelName = "ellipticAddBC" + suffix; addBCKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } //create occa buffers dlong Nall = mesh.Np*(mesh.Nelements+mesh.totalHaloPairs); - dfloat *rL = (dfloat*) calloc(Nall, sizeof(dfloat)); - dfloat *xL = (dfloat*) calloc(Nall, sizeof(dfloat)); - occa::memory o_rL = platform.malloc(Nall*sizeof(dfloat), rL); - occa::memory o_xL = platform.malloc(Nall*sizeof(dfloat), xL); + memory rL(Nall, 0.0); + memory xL(Nall, 0.0); + deviceMemory o_rL = platform.malloc(rL); + deviceMemory o_xL = platform.malloc(xL); - occa::memory o_r, o_x; + deviceMemory o_r, o_x; if (settings.compareSetting("DISCRETIZATION","IPDG")) { o_r = o_rL; o_x = o_xL; } else { - dlong Ng = ogsMasked->Ngather; - dlong Nghalo = ogsMasked->NgatherHalo; + dlong Ng = ogsMasked.Ngather; + dlong Nghalo = gHalo.Nhalo; dlong Ngall = Ng + Nghalo; - o_r = platform.malloc(Ngall*sizeof(dfloat)); - o_x = 
platform.malloc(Ngall*sizeof(dfloat)); + o_r = platform.malloc(Ngall); + o_x = platform.malloc(Ngall); } //storage for M*q during reporting - occa::memory o_MxL = platform.malloc(Nall*sizeof(dfloat), xL); + deviceMemory o_MxL = platform.malloc(xL); mesh.MassMatrixKernelSetup(Nfields); // mass matrix operator //populate rhs forcing forcingKernel(mesh.Nelements, - mesh.o_ggeo, + mesh.o_wJ, mesh.o_MM, mesh.o_x, mesh.o_y, @@ -151,6 +162,7 @@ void elliptic_t::Run(){ o_rL); } else if (settings.compareSetting("DISCRETIZATION","CONTINUOUS")) { rhsBCKernel(mesh.Nelements, + mesh.o_wJ, mesh.o_ggeo, mesh.o_sgeo, mesh.o_D, @@ -168,25 +180,23 @@ void elliptic_t::Run(){ // gather rhs to globalDofs if c0 if(settings.compareSetting("DISCRETIZATION","CONTINUOUS")){ - ogsMasked->Gather(o_r, o_rL, ogs_dfloat, ogs_add, ogs_trans); - ogsMasked->Gather(o_x, o_xL, ogs_dfloat, ogs_add, ogs_notrans); + ogsMasked.Gather(o_r, o_rL, 1, ogs::Add, ogs::Trans); + ogsMasked.Gather(o_x, o_xL, 1, ogs::Add, ogs::NoTrans); } int maxIter = 5000; int verbose = settings.compareSetting("VERBOSE", "TRUE") ? 1 : 0; - MPI_Barrier(mesh.comm); - double startTime = MPI_Wtime(); + timePoint_t start = GlobalPlatformTime(platform); //call the solver - dfloat tol = 1e-8; - int iter = Solve(*linearSolver, o_x, o_r, tol, maxIter, verbose); - + dfloat tol = (sizeof(dfloat)==sizeof(double)) ? 
1.0e-8 : 1.0e-5; + int iter = Solve(linearSolver, o_x, o_r, tol, maxIter, verbose); //add the boundary data to the masked nodes if(settings.compareSetting("DISCRETIZATION","CONTINUOUS")){ // scatter x to LocalDofs if c0 - ogsMasked->Scatter(o_xL, o_x, ogs_dfloat, ogs_add, ogs_notrans); + ogsMasked.Scatter(o_xL, o_x, 1, ogs::NoTrans); //fill masked nodes with BC data addBCKernel(mesh.Nelements, mesh.o_x, @@ -196,9 +206,8 @@ void elliptic_t::Run(){ o_xL); } - MPI_Barrier(mesh.comm); - double endTime = MPI_Wtime(); - double elapsedTime = endTime - startTime; + timePoint_t end = GlobalPlatformTime(platform); + double elapsedTime = ElapsedTime(start, end); if ((mesh.rank==0) && verbose){ printf("%d, " hlongFormat ", %g, %d, %g, %g; global: N, dofs, elapsed, iterations, time per node, nodes*iterations/time %s\n", @@ -217,7 +226,7 @@ void elliptic_t::Run(){ o_xL.copyTo(xL); // output field files - string name; + std::string name; settings.getSetting("OUTPUT FILE NAME", name); char fname[BUFSIZ]; sprintf(fname, "%s_%04d.vtu", name.c_str(), mesh.rank); @@ -231,15 +240,9 @@ void elliptic_t::Run(){ mesh.MassMatrixApply(o_xL, o_MxL); dlong Nentries = mesh.Nelements*mesh.Np*Nfields; - dfloat norm2 = sqrt(linAlg.innerProd(Nentries, o_xL, o_MxL, mesh.comm)); + dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_xL, o_MxL, mesh.comm)); if(mesh.rank==0) printf("Solution norm = %17.15lg\n", norm2); } - - free(rL); free(xL); - o_rL.free(); o_xL.free(); - o_r.free(); o_x.free(); - o_MxL.free(); - delete linearSolver; } diff --git a/solvers/elliptic/src/ellipticSettings.cpp b/solvers/elliptic/src/ellipticSettings.cpp index 1078b5fc3..05080cc58 100644 --- a/solvers/elliptic/src/ellipticSettings.cpp +++ b/solvers/elliptic/src/ellipticSettings.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, 
to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,7 +27,7 @@ SOFTWARE. #include "elliptic.hpp" //settings for elliptic solver -ellipticSettings_t::ellipticSettings_t(const MPI_Comm& _comm): +ellipticSettings_t::ellipticSettings_t(const comm_t& _comm): settings_t(_comm) { //common settings used when the elliptic solver @@ -55,7 +55,7 @@ void ellipticAddRunSettings(settings_t& settings) { } void ellipticAddSettings(settings_t& settings, - const string prefix) { + const std::string prefix) { settings.newSetting(prefix+"DISCRETIZATION", "CONTINUOUS", "Type of Finite Element Discretization", @@ -99,10 +99,7 @@ void ellipticAddSettings(settings_t& settings, void ellipticSettings_t::report() { - int rank; - MPI_Comm_rank(comm, &rank); - - if (rank==0) { + if (comm.rank()==0) { std::cout << "Elliptic Settings:\n\n"; reportSetting("DATA FILE"); @@ -129,15 +126,15 @@ void ellipticSettings_t::report() { void ellipticSettings_t::parseFromFile(platformSettings_t& platformSettings, meshSettings_t& meshSettings, - const string filename) { + const std::string filename) { //read all settings from file settings_t s(comm); s.readSettingsFromFile(filename); for(auto it = s.settings.begin(); it != s.settings.end(); ++it) { - setting_t* set = it->second; - const string name = set->getName(); - const string val = set->getVal(); + setting_t& set = it->second; + const std::string name = set.getName(); + const std::string val = set.getVal(); if (platformSettings.hasSetting(name)) platformSettings.changeSetting(name, val); else if (meshSettings.hasSetting(name)) @@ -145,9 +142,7 @@ void ellipticSettings_t::parseFromFile(platformSettings_t& platformSettings, else if (hasSetting(name)) //self changeSetting(name, val); else { - stringstream ss; - ss << "Unknown setting: [" << name << "] requested"; - LIBP_ABORT(ss.str()); + LIBP_FORCE_ABORT("Unknown setting: [" << name << "] requested"); } } } diff --git 
a/solvers/elliptic/src/ellipticSetup.cpp b/solvers/elliptic/src/ellipticSetup.cpp index 6e8de66d9..105ad63ed 100644 --- a/solvers/elliptic/src/ellipticSetup.cpp +++ b/solvers/elliptic/src/ellipticSetup.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,157 +27,154 @@ SOFTWARE. #include "elliptic.hpp" #include "ellipticPrecon.hpp" -elliptic_t& elliptic_t::Setup(platform_t& platform, mesh_t& mesh, - ellipticSettings_t& settings, dfloat lambda, - const int NBCTypes, const int *BCType){ +void elliptic_t::Setup(platform_t& _platform, mesh_t& _mesh, + settings_t& _settings, dfloat _lambda, + const int _NBCTypes, const memory _BCType){ - elliptic_t* elliptic = new elliptic_t(platform, mesh, settings, lambda); + platform = _platform; + mesh = _mesh; + comm = _mesh.comm; + settings = _settings; + lambda = _lambda; - elliptic->Nfields = 1; + Nfields = 1; - elliptic->disc_ipdg = settings.compareSetting("DISCRETIZATION","IPDG"); - elliptic->disc_c0 = settings.compareSetting("DISCRETIZATION","CONTINUOUS"); + //Trigger JIT kernel builds + ogs::InitializeKernels(platform, ogs::Dfloat, ogs::Add); + + disc_ipdg = settings.compareSetting("DISCRETIZATION","IPDG"); + disc_c0 = settings.compareSetting("DISCRETIZATION","CONTINUOUS"); //setup linear algebra module - platform.linAlg.InitKernels({"add", "sum", "scale", + platform.linAlg().InitKernels({"add", "sum", "scale", "axpy", "zaxpy", "amx", "amxpy", "zamxpy", "adx", "adxpy", "zadxpy", - "innerProd", "weightedInnerProd", - "norm2", "weightedNorm2"}); + "innerProd", "norm2"}); /*setup trace halo exchange */ - elliptic->traceHalo = mesh.HaloTraceSetup(elliptic->Nfields); + traceHalo = mesh.HaloTraceSetup(Nfields); // Boundary Type 
translation. Just defaults. - elliptic->BCType = (int*) calloc(NBCTypes,sizeof(int)); - memcpy(elliptic->BCType,BCType,NBCTypes*sizeof(int)); + NBCTypes = _NBCTypes; + BCType.malloc(NBCTypes); + BCType.copyFrom(_BCType); //setup boundary flags and make mask and masked ogs - elliptic->BoundarySetup(); + BoundarySetup(); if (settings.compareSetting("DISCRETIZATION","IPDG")) { //tau (penalty term in IPDG) - if (mesh.elementType==TRIANGLES || - mesh.elementType==QUADRILATERALS){ - elliptic->tau = 2.0*(mesh.N+1)*(mesh.N+2)/2.0; + if (mesh.elementType==Mesh::TRIANGLES || + mesh.elementType==Mesh::QUADRILATERALS){ + tau = 2.0*(mesh.N+1)*(mesh.N+2)/2.0; if(mesh.dim==3) - elliptic->tau *= 1.5; + tau *= 1.5; } else - elliptic->tau = 2.0*(mesh.N+1)*(mesh.N+3); + tau = 2.0*(mesh.N+1)*(mesh.N+3); //buffer for gradient dlong Ntotal = mesh.Np*(mesh.Nelements+mesh.totalHaloPairs); - elliptic->grad = (dfloat*) calloc(Ntotal*4, sizeof(dfloat)); - elliptic->o_grad = platform.malloc(Ntotal*4*sizeof(dfloat), elliptic->grad); + grad.malloc(Ntotal*4); + o_grad = platform.malloc(grad); } else { - elliptic->tau = 0.0; + tau = 0.0; //buffer for local Ax dlong Ntotal = mesh.Np*mesh.Nelements; - elliptic->o_AqL = platform.malloc(Ntotal*sizeof(dfloat)); + o_AqL = platform.malloc(Ntotal); } // OCCA build stuff - occa::properties kernelInfo = mesh.props; //copy base occa properties + properties_t kernelInfo = mesh.props; //copy base occa properties // set kernel name suffix - char *suffix; - if(mesh.elementType==TRIANGLES){ + std::string suffix; + if(mesh.elementType==Mesh::TRIANGLES){ if(mesh.dim==2) - suffix = strdup("Tri2D"); + suffix = "Tri2D"; else - suffix = strdup("Tri3D"); - } else if(mesh.elementType==QUADRILATERALS){ + suffix = "Tri3D"; + } else if(mesh.elementType==Mesh::QUADRILATERALS){ if(mesh.dim==2) - suffix = strdup("Quad2D"); + suffix = "Quad2D"; else - suffix = strdup("Quad3D"); - } else if(mesh.elementType==TETRAHEDRA) - suffix = strdup("Tet3D"); - else 
if(mesh.elementType==HEXAHEDRA) - suffix = strdup("Hex3D"); + suffix = "Quad3D"; + } else if(mesh.elementType==Mesh::TETRAHEDRA) + suffix = "Tet3D"; + else if(mesh.elementType==Mesh::HEXAHEDRA) + suffix = "Hex3D"; + + std::string oklFilePrefix = DELLIPTIC "/okl/"; + std::string oklFileSuffix = ".okl"; - char fileName[BUFSIZ], kernelName[BUFSIZ]; + std::string fileName, kernelName; //add standard boundary functions - char *boundaryHeaderFileName; + std::string boundaryHeaderFileName; if (mesh.dim==2) - boundaryHeaderFileName = strdup(DELLIPTIC "/data/ellipticBoundary2D.h"); + boundaryHeaderFileName = std::string(DELLIPTIC "/data/ellipticBoundary2D.h"); else if (mesh.dim==3) - boundaryHeaderFileName = strdup(DELLIPTIC "/data/ellipticBoundary3D.h"); + boundaryHeaderFileName = std::string(DELLIPTIC "/data/ellipticBoundary3D.h"); kernelInfo["includes"] += boundaryHeaderFileName; int blockMax = 256; if (platform.device.mode() == "CUDA") blockMax = 512; - int NblockV = mymax(1,blockMax/mesh.Np); + int NblockV = std::max(1,blockMax/mesh.Np); kernelInfo["defines/" "p_NblockV"]= NblockV; // Ax kernel if (settings.compareSetting("DISCRETIZATION","CONTINUOUS")) { - sprintf(fileName, DELLIPTIC "/okl/ellipticAx%s.okl", suffix); - if(mesh.elementType==HEXAHEDRA){ + fileName = oklFilePrefix + "ellipticAx" + suffix + oklFileSuffix; + if(mesh.elementType==Mesh::HEXAHEDRA){ if(mesh.settings.compareSetting("ELEMENT MAP", "TRILINEAR")) - sprintf(kernelName, "ellipticPartialAxTrilinear%s", suffix); + kernelName = "ellipticPartialAxTrilinear" + suffix; else - sprintf(kernelName, "ellipticPartialAx%s", suffix); + kernelName = "ellipticPartialAx" + suffix; } else{ - sprintf(kernelName, "ellipticPartialAx%s", suffix); + kernelName = "ellipticPartialAx" + suffix; } - elliptic->partialAxKernel = platform.buildKernel(fileName, kernelName, - kernelInfo); + partialAxKernel = platform.buildKernel(fileName, kernelName, + kernelInfo); } else if (settings.compareSetting("DISCRETIZATION","IPDG")) { - 
int Nmax = mymax(mesh.Np, mesh.Nfaces*mesh.Nfp); + int Nmax = std::max(mesh.Np, mesh.Nfaces*mesh.Nfp); kernelInfo["defines/" "p_Nmax"]= Nmax; - sprintf(fileName, DELLIPTIC "/okl/ellipticGradient%s.okl", suffix); - sprintf(kernelName, "ellipticPartialGradient%s", suffix); - elliptic->partialGradientKernel = platform.buildKernel(fileName, kernelName, + fileName = oklFilePrefix + "ellipticGradient" + suffix + oklFileSuffix; + kernelName = "ellipticPartialGradient" + suffix; + partialGradientKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(fileName, DELLIPTIC "/okl/ellipticAxIpdg%s.okl", suffix); - sprintf(kernelName, "ellipticPartialAxIpdg%s", suffix); - elliptic->partialIpdgKernel = platform.buildKernel(fileName, kernelName, + fileName = oklFilePrefix + "ellipticAxIpdg" + suffix + oklFileSuffix; + kernelName = "ellipticPartialAxIpdg" + suffix; + partialIpdgKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } /* Preconditioner Setup */ if (settings.compareSetting("DISCRETIZATION", "CONTINUOUS")) { - elliptic->Ndofs = elliptic->ogsMasked->Ngather*elliptic->Nfields; - elliptic->Nhalo = elliptic->ogsMasked->NgatherHalo*elliptic->Nfields; + Ndofs = ogsMasked.Ngather*Nfields; + Nhalo = gHalo.Nhalo*Nfields; } else { - elliptic->Ndofs = mesh.Nelements*mesh.Np*elliptic->Nfields; - elliptic->Nhalo = mesh.totalHaloPairs*mesh.Np*elliptic->Nfields; + Ndofs = mesh.Nelements*mesh.Np*Nfields; + Nhalo = mesh.totalHaloPairs*mesh.Np*Nfields; } if (settings.compareSetting("PRECONDITIONER", "JACOBI")) - elliptic->precon = new JacobiPrecon(*elliptic); + precon.Setup(*this); else if(settings.compareSetting("PRECONDITIONER", "MASSMATRIX")) - elliptic->precon = new MassMatrixPrecon(*elliptic); + precon.Setup(*this); else if(settings.compareSetting("PRECONDITIONER", "PARALMOND")) - elliptic->precon = new ParAlmondPrecon(*elliptic); + precon.Setup(*this); else if(settings.compareSetting("PRECONDITIONER", "MULTIGRID")) - elliptic->precon = new 
MultiGridPrecon(*elliptic); + precon.Setup(*this); else if(settings.compareSetting("PRECONDITIONER", "SEMFEM")) - elliptic->precon = new SEMFEMPrecon(*elliptic); + precon.Setup(*this); else if(settings.compareSetting("PRECONDITIONER", "OAS")) - elliptic->precon = new OASPrecon(*elliptic); + precon.Setup(*this); else if(settings.compareSetting("PRECONDITIONER", "NONE")) - elliptic->precon = new IdentityPrecon(elliptic->Ndofs); - - return *elliptic; -} - -elliptic_t::~elliptic_t() { - maskKernel.free(); - partialAxKernel.free(); - partialGradientKernel.free(); - partialIpdgKernel.free(); - - if (traceHalo) traceHalo->Free(); - if (ogsMasked) ogsMasked->Free(); - if (precon) delete precon; + precon.Setup(Ndofs); } diff --git a/solvers/elliptic/src/ellipticSetupNewDegree.cpp b/solvers/elliptic/src/ellipticSetupNewDegree.cpp index 77c941f1a..163aa42a6 100644 --- a/solvers/elliptic/src/ellipticSetupNewDegree.cpp +++ b/solvers/elliptic/src/ellipticSetupNewDegree.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,44 +27,31 @@ SOFTWARE. 
#include "elliptic.hpp" #include "ellipticPrecon.hpp" -elliptic_t& elliptic_t::SetupNewDegree(mesh_t& meshC){ +elliptic_t elliptic_t::SetupNewDegree(mesh_t& meshC){ //if asking for the same degree, return the original solver if (meshC.N == mesh.N) return *this; - elliptic_t* elliptic = new elliptic_t(platform, meshC, settings, lambda); - //shallow copy - elliptic->Nfields = Nfields; - elliptic->lambda = lambda; - - elliptic->disc_ipdg = disc_ipdg; - elliptic->disc_c0 = disc_c0; - - elliptic->grad = grad; - elliptic->o_grad = o_grad; + elliptic_t elliptic = *this; - elliptic->o_AqL = o_AqL; - - elliptic->BCType = BCType; - - elliptic->maskKernel = maskKernel; + elliptic.mesh = meshC; /*setup trace halo exchange */ - elliptic->traceHalo = meshC.HaloTraceSetup(elliptic->Nfields); + elliptic.traceHalo = meshC.HaloTraceSetup(Nfields); //setup boundary flags and make mask and masked ogs - elliptic->BoundarySetup(); + elliptic.BoundarySetup(); //tau (penalty term in IPDG) if (settings.compareSetting("DISCRETIZATION","IPDG")) { - if (meshC.elementType==TRIANGLES || - meshC.elementType==QUADRILATERALS){ - elliptic->tau = 2.0*(meshC.N+1)*(meshC.N+2)/2.0; + if (meshC.elementType==Mesh::TRIANGLES || + meshC.elementType==Mesh::QUADRILATERALS){ + elliptic.tau = 2.0*(meshC.N+1)*(meshC.N+2)/2.0; if(meshC.dim==3) - elliptic->tau *= 1.5; + elliptic.tau *= 1.5; } else - elliptic->tau = 2.0*(meshC.N+1)*(meshC.N+3); + elliptic.tau = 2.0*(meshC.N+1)*(meshC.N+3); //buffer for gradient (Reuse the original buffer) // dlong Ntotal = meshC.Np*(meshC.Nelements+meshC.totalHaloPairs); @@ -73,78 +60,83 @@ elliptic_t& elliptic_t::SetupNewDegree(mesh_t& meshC){ } // OCCA build stuff - occa::properties kernelInfo = meshC.props; //copy base occa properties + properties_t kernelInfo = meshC.props; //copy base occa properties // set kernel name suffix - char *suffix; - if(meshC.elementType==TRIANGLES){ + std::string suffix; + if(meshC.elementType==Mesh::TRIANGLES){ if(meshC.dim==2) - suffix = 
strdup("Tri2D"); + suffix = "Tri2D"; else - suffix = strdup("Tri3D"); - } else if(meshC.elementType==QUADRILATERALS){ + suffix = "Tri3D"; + } else if(meshC.elementType==Mesh::QUADRILATERALS){ if(meshC.dim==2) - suffix = strdup("Quad2D"); + suffix = "Quad2D"; else - suffix = strdup("Quad3D"); - } else if(meshC.elementType==TETRAHEDRA) - suffix = strdup("Tet3D"); - else if(meshC.elementType==HEXAHEDRA) - suffix = strdup("Hex3D"); + suffix = "Quad3D"; + } else if(meshC.elementType==Mesh::TETRAHEDRA) + suffix = "Tet3D"; + else if(meshC.elementType==Mesh::HEXAHEDRA) + suffix = "Hex3D"; + + std::string oklFilePrefix = DELLIPTIC "/okl/"; + std::string oklFileSuffix = ".okl"; - char fileName[BUFSIZ], kernelName[BUFSIZ]; + std::string fileName, kernelName; //add standard boundary functions - char *boundaryHeaderFileName; + std::string boundaryHeaderFileName; if (meshC.dim==2) - boundaryHeaderFileName = strdup(DELLIPTIC "/data/ellipticBoundary2D.h"); + boundaryHeaderFileName = std::string(DELLIPTIC "/data/ellipticBoundary2D.h"); else if (meshC.dim==3) - boundaryHeaderFileName = strdup(DELLIPTIC "/data/ellipticBoundary3D.h"); + boundaryHeaderFileName = std::string(DELLIPTIC "/data/ellipticBoundary3D.h"); kernelInfo["includes"] += boundaryHeaderFileName; int blockMax = 256; if (platform.device.mode() == "CUDA") blockMax = 512; - int NblockV = mymax(1,blockMax/meshC.Np); + int NblockV = std::max(1,blockMax/meshC.Np); kernelInfo["defines/" "p_NblockV"]= NblockV; // Ax kernel if (settings.compareSetting("DISCRETIZATION","CONTINUOUS")) { - sprintf(fileName, DELLIPTIC "/okl/ellipticAx%s.okl", suffix); - if(meshC.elementType==HEXAHEDRA){ + fileName = oklFilePrefix + "ellipticAx" + suffix + oklFileSuffix; + if(meshC.elementType==Mesh::HEXAHEDRA){ if(mesh.settings.compareSetting("ELEMENT MAP", "TRILINEAR")) - sprintf(kernelName, "ellipticPartialAxTrilinear%s", suffix); + kernelName = "ellipticPartialAxTrilinear" + suffix; else - sprintf(kernelName, "ellipticPartialAx%s", suffix); + 
kernelName = "ellipticPartialAx" + suffix; } else{ - sprintf(kernelName, "ellipticPartialAx%s", suffix); + kernelName = "ellipticPartialAx" + suffix; } - elliptic->partialAxKernel = platform.buildKernel(fileName, kernelName, + elliptic.partialAxKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } else if (settings.compareSetting("DISCRETIZATION","IPDG")) { - int Nmax = mymax(meshC.Np, meshC.Nfaces*meshC.Nfp); + int Nmax = std::max(meshC.Np, meshC.Nfaces*meshC.Nfp); kernelInfo["defines/" "p_Nmax"]= Nmax; - sprintf(fileName, DELLIPTIC "/okl/ellipticGradient%s.okl", suffix); - sprintf(kernelName, "ellipticPartialGradient%s", suffix); - elliptic->partialGradientKernel = platform.buildKernel(fileName, kernelName, + fileName = oklFilePrefix + "ellipticGradient" + suffix + oklFileSuffix; + kernelName = "ellipticPartialGradient" + suffix; + elliptic.partialGradientKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(fileName, DELLIPTIC "/okl/ellipticAxIpdg%s.okl", suffix); - sprintf(kernelName, "ellipticPartialAxIpdg%s", suffix); - elliptic->partialIpdgKernel = platform.buildKernel(fileName, kernelName, + fileName = oklFilePrefix + "ellipticAxIpdg" + suffix + oklFileSuffix; + kernelName = "ellipticPartialAxIpdg" + suffix; + elliptic.partialIpdgKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } if (settings.compareSetting("DISCRETIZATION", "CONTINUOUS")) { - elliptic->Ndofs = elliptic->ogsMasked->Ngather*elliptic->Nfields; + elliptic.Ndofs = elliptic.ogsMasked.Ngather*Nfields; + elliptic.Nhalo = elliptic.gHalo.Nhalo*Nfields; } else { - elliptic->Ndofs = meshC.Nelements*meshC.Np*elliptic->Nfields; + elliptic.Ndofs = meshC.Nelements*meshC.Np*Nfields; + elliptic.Nhalo = meshC.totalHaloPairs*meshC.Np*Nfields; } - elliptic->precon = NULL; + elliptic.precon = precon_t(); - return *elliptic; + return elliptic; } diff --git a/solvers/elliptic/src/ellipticSetupRingPatch.cpp b/solvers/elliptic/src/ellipticSetupRingPatch.cpp index 
92f8b0281..7a3a4bd7b 100644 --- a/solvers/elliptic/src/ellipticSetupRingPatch.cpp +++ b/solvers/elliptic/src/ellipticSetupRingPatch.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,59 +27,41 @@ SOFTWARE. #include "elliptic.hpp" #include "ellipticPrecon.hpp" -elliptic_t* elliptic_t::SetupRingPatch(mesh_t& meshPatch){ +elliptic_t elliptic_t::SetupRingPatch(mesh_t& meshPatch){ //just reuse the current solver if there are no neighbors - if (mesh.size == 1) return this; - - elliptic_t* elliptic = new elliptic_t(platform, meshPatch, settings, lambda); + if (mesh.size == 1) return *this; //shallow copy - elliptic->Nfields = Nfields; - elliptic->lambda = lambda; + elliptic_t elliptic = *this; - elliptic->disc_ipdg = disc_ipdg; - elliptic->disc_c0 = disc_c0; + elliptic.mesh = meshPatch; + elliptic.comm = meshPatch.comm; //buffer for gradient if (settings.compareSetting("DISCRETIZATION","IPDG")) { dlong Ntotal = meshPatch.Np*meshPatch.Nelements; - elliptic->grad = (dfloat*) calloc(Ntotal*4, sizeof(dfloat)); - elliptic->o_grad = platform.malloc(Ntotal*4*sizeof(dfloat), elliptic->grad); + elliptic.grad.malloc(Ntotal*4, 0.0); + elliptic.o_grad = platform.malloc(elliptic.grad); } else { //buffer for local Ax dlong Ntotal = meshPatch.Np*meshPatch.Nelements; - elliptic->o_AqL = platform.malloc(Ntotal*sizeof(dfloat)); + elliptic.o_AqL = platform.malloc(Ntotal); } - //tau (penalty term in IPDG) - elliptic->tau = tau; /*setup trace halo exchange */ - elliptic->traceHalo = meshPatch.HaloTraceSetup(elliptic->Nfields); - - elliptic->BCType = BCType; - - elliptic->maskKernel = maskKernel; + elliptic.traceHalo = meshPatch.HaloTraceSetup(Nfields); //setup boundary flags and make mask and 
masked ogs - elliptic->BoundarySetup(); - - - // Ax kernel - if (settings.compareSetting("DISCRETIZATION","CONTINUOUS")) { - elliptic->partialAxKernel = partialAxKernel; - } else if (settings.compareSetting("DISCRETIZATION","IPDG")) { - elliptic->partialGradientKernel = partialGradientKernel; - elliptic->partialIpdgKernel = partialIpdgKernel; - } + elliptic.BoundarySetup(); if (settings.compareSetting("DISCRETIZATION", "CONTINUOUS")) { - elliptic->Ndofs = elliptic->ogsMasked->Ngather*elliptic->Nfields; + elliptic.Ndofs = elliptic.ogsMasked.Ngather*Nfields; } else { - elliptic->Ndofs = meshPatch.Nelements*meshPatch.Np*elliptic->Nfields; + elliptic.Ndofs = meshPatch.Nelements*meshPatch.Np*Nfields; } - elliptic->precon = NULL; + elliptic.precon = precon_t(); return elliptic; } diff --git a/solvers/elliptic/src/ellipticSolve.cpp b/solvers/elliptic/src/ellipticSolve.cpp index e6e385edb..be75efa3a 100644 --- a/solvers/elliptic/src/ellipticSolve.cpp +++ b/solvers/elliptic/src/ellipticSolve.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,13 +27,13 @@ SOFTWARE. 
#include "elliptic.hpp" int elliptic_t::Solve(linearSolver_t& linearSolver, - occa::memory &o_x, occa::memory &o_r, + deviceMemory &o_x, deviceMemory &o_r, const dfloat tol, const int MAXIT, const int verbose){ // if there is a nullspace, remove the constant vector from r if(allNeumann) ZeroMean(o_r); - int Niter = linearSolver.Solve(*this, *precon, o_x, o_r, tol, MAXIT, verbose); + int Niter = linearSolver.Solve(*this, precon, o_x, o_r, tol, MAXIT, verbose); return Niter; } diff --git a/solvers/elliptic/src/ellipticZeroMean.cpp b/solvers/elliptic/src/ellipticZeroMean.cpp index 1a5560486..3233bd566 100644 --- a/solvers/elliptic/src/ellipticZeroMean.cpp +++ b/solvers/elliptic/src/ellipticZeroMean.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) - Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,13 +26,12 @@ #include "elliptic.hpp" -void elliptic_t::ZeroMean(occa::memory &o_q){ +void elliptic_t::ZeroMean(deviceMemory &o_q){ - dfloat qmean = linAlg.sum(Ndofs, o_q, mesh.comm); + dfloat qmean = platform.linAlg().sum(Ndofs, o_q, mesh.comm); // normalize qmean *= allNeumannScale*allNeumannScale; - // q[n] = q[n] - qmean - platform.linAlg.add(Ndofs, -qmean, o_q); + platform.linAlg().add(Ndofs, -qmean, o_q); } diff --git a/solvers/fokkerPlanck/data/fpeLinear2D.h b/solvers/fokkerPlanck/data/fpeLinear2D.h index 014feb0fa..b73fcefbc 100644 --- a/solvers/fokkerPlanck/data/fpeLinear2D.h +++ b/solvers/fokkerPlanck/data/fpeLinear2D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 
associated documentation files (the "Software"), to deal @@ -66,4 +66,4 @@ SOFTWARE. *(qxB) = 0.0; \ *(qyB) = 0.0; \ } \ -} \ No newline at end of file +} diff --git a/solvers/fokkerPlanck/data/fpeLinear3D.h b/solvers/fokkerPlanck/data/fpeLinear3D.h index 4d36c316a..a00f02095 100644 --- a/solvers/fokkerPlanck/data/fpeLinear3D.h +++ b/solvers/fokkerPlanck/data/fpeLinear3D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -72,4 +72,4 @@ SOFTWARE. *(qyB) = 0.0; \ *(qzB) = 0.0; \ } \ -} \ No newline at end of file +} diff --git a/solvers/fokkerPlanck/fpe.hpp b/solvers/fokkerPlanck/fpe.hpp index 912730587..c0e360fa9 100644 --- a/solvers/fokkerPlanck/fpe.hpp +++ b/solvers/fokkerPlanck/fpe.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) - Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -37,48 +37,47 @@ #define DFPE LIBP_DIR"/solvers/fokkerPlanck/" +using namespace libp; + class fpeSettings_t: public settings_t { public: - fpeSettings_t(MPI_Comm& _comm); + fpeSettings_t(comm_t& _comm); void report(); void parseFromFile(platformSettings_t& platformSettings, meshSettings_t& meshSettings, - const string filename); + const std::string filename); - ellipticSettings_t* extractEllipticSettings(); + ellipticSettings_t extractEllipticSettings(); }; class fpe_t; class subcycler_t: public solver_t { public: - mesh_t& mesh; + mesh_t mesh; int cubature; - halo_t* traceHalo; - occa::kernel advectionVolumeKernel; - occa::kernel 
advectionSurfaceKernel; - - subcycler_t() = delete; - subcycler_t(fpe_t& fpe); + ogs::halo_t traceHalo; + kernel_t advectionVolumeKernel; + kernel_t advectionSurfaceKernel; - ~subcycler_t(){}; + subcycler_t() = default; void Report(dfloat time, int tstep){}; - void rhsf(occa::memory& o_q, occa::memory& o_rhs, const dfloat time); + void rhsf(deviceMemory& o_q, deviceMemory& o_rhs, const dfloat time); }; class fpe_t: public solver_t { public: - mesh_t& mesh; - TimeStepper::timeStepper_t* timeStepper; + mesh_t mesh; + timeStepper_t timeStepper; - halo_t* traceHalo; + ogs::halo_t traceHalo; - ellipticSettings_t *ellipticSettings; - elliptic_t *elliptic; - linearSolver_t *linearSolver; + ellipticSettings_t ellipticSettings; + elliptic_t elliptic; + linearSolver_t linearSolver; int Nfields; @@ -87,60 +86,59 @@ class fpe_t: public solver_t { dfloat mu; dfloat tau; - dfloat *q; - occa::memory o_q; + memory q; + deviceMemory o_q; - occa::memory o_Mq; + deviceMemory o_Mq; - dfloat *grad; - occa::memory o_grad; + memory grad; + deviceMemory o_grad; //subcycling int Nsubcycles; - TimeStepper::timeStepper_t* subStepper; - subcycler_t *subcycler; + timeStepper_t subStepper; + subcycler_t subcycler; - occa::kernel advectionVolumeKernel; - occa::kernel advectionSurfaceKernel; - occa::kernel gradientKernel; - occa::kernel diffusionKernel; - occa::kernel diffusionRhsKernel; + kernel_t advectionVolumeKernel; + kernel_t advectionSurfaceKernel; + kernel_t gradientKernel; + kernel_t diffusionKernel; + kernel_t diffusionRhsKernel; - occa::kernel initialConditionKernel; - occa::kernel maxWaveSpeedKernel; + kernel_t initialConditionKernel; + kernel_t maxWaveSpeedKernel; - fpe_t() = delete; + fpe_t() = default; fpe_t(platform_t &_platform, mesh_t &_mesh, - settings_t& _settings): - solver_t(_platform, _settings), mesh(_mesh) {} - - ~fpe_t(); + fpeSettings_t& _settings) { + Setup(_platform, _mesh, _settings); + } //setup - static fpe_t& Setup(platform_t& platform, mesh_t& mesh, - 
fpeSettings_t& settings); + void Setup(platform_t& _platform, mesh_t& _mesh, + fpeSettings_t& _settings); void Run(); void Report(dfloat time, int tstep); - void PlotFields(dfloat* Q, char *fileName); + void PlotFields(memory& Q, std::string fileName); - dfloat MaxWaveSpeed(occa::memory& o_Q, const dfloat T); + dfloat MaxWaveSpeed(deviceMemory& o_Q, const dfloat T); - void rhsf(occa::memory& o_q, occa::memory& o_rhs, const dfloat time); + void rhsf(deviceMemory& o_q, deviceMemory& o_rhs, const dfloat time); - void rhs_imex_f(occa::memory& o_q, occa::memory& o_rhs, const dfloat time); - void rhs_imex_g(occa::memory& o_q, occa::memory& o_rhs, const dfloat time); + void rhs_imex_f(deviceMemory& o_q, deviceMemory& o_rhs, const dfloat time); + void rhs_imex_g(deviceMemory& o_q, deviceMemory& o_rhs, const dfloat time); - void rhs_imex_invg(occa::memory& o_q, occa::memory& o_rhs, const dfloat gamma, const dfloat time); + void rhs_imex_invg(deviceMemory& o_q, deviceMemory& o_rhs, const dfloat gamma, const dfloat time); - void rhs_subcycle_f(occa::memory& o_Q, occa::memory& o_QHAT, - const dfloat T, const dfloat dt, const dfloat* B, - const int order, const int shiftIndex, const int maxOrder); + void rhs_subcycle_f(deviceMemory& o_Q, deviceMemory& o_QHAT, + const dfloat T, const dfloat dt, const memory B, + const int order, const int shiftIndex, const int maxOrder); - void Advection(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T); - void Diffusion(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T); + void Advection(deviceMemory& o_Q, deviceMemory& o_RHS, const dfloat T); + void Diffusion(deviceMemory& o_Q, deviceMemory& o_RHS, const dfloat T); }; #endif diff --git a/solvers/fokkerPlanck/fpeMain.cpp b/solvers/fokkerPlanck/fpeMain.cpp index abf5f9d83..3eaa8a6e0 100644 --- a/solvers/fokkerPlanck/fpeMain.cpp +++ b/solvers/fokkerPlanck/fpeMain.cpp @@ -1,7 +1,7 @@ /* The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus 
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -28,40 +28,41 @@ SOFTWARE. int main(int argc, char **argv){ // start up MPI - MPI_Init(&argc, &argv); + Comm::Init(argc, argv); - MPI_Comm comm = MPI_COMM_WORLD; + LIBP_ABORT("Usage: ./fpeMain setupfile", argc!=2); - if(argc!=2) - LIBP_ABORT(string("Usage: ./fpeMain setupfile")); + { /*Scope so everything is destructed before MPI_Finalize */ + comm_t comm(Comm::World().Dup()); - //create default settings - platformSettings_t platformSettings(comm); - meshSettings_t meshSettings(comm); - fpeSettings_t fpeSettings(comm); + //create default settings + platformSettings_t platformSettings(comm); + meshSettings_t meshSettings(comm); + fpeSettings_t fpeSettings(comm); - //load settings from file - fpeSettings.parseFromFile(platformSettings, meshSettings, - argv[1]); + //load settings from file + fpeSettings.parseFromFile(platformSettings, meshSettings, + argv[1]); - // set up platform - platform_t platform(platformSettings); + // set up platform + platform_t platform(platformSettings); - platformSettings.report(); - meshSettings.report(); - fpeSettings.report(); + platformSettings.report(); + meshSettings.report(); + fpeSettings.report(); - // set up mesh - mesh_t& mesh = mesh_t::Setup(platform, meshSettings, comm); + // set up mesh + mesh_t mesh(platform, meshSettings, comm); - // set up fpe solver - fpe_t& fpe = fpe_t::Setup(platform, mesh, fpeSettings); + // set up fpe solver + fpe_t fpe(platform, mesh, fpeSettings); - // run - fpe.Run(); + // run + fpe.Run(); + } // close down MPI - MPI_Finalize(); + Comm::Finalize(); return LIBP_SUCCESS; } diff --git a/solvers/fokkerPlanck/makefile b/solvers/fokkerPlanck/makefile index 1a11fa09a..340800e29 100644 --- a/solvers/fokkerPlanck/makefile +++ b/solvers/fokkerPlanck/makefile @@ -2,7 +2,7 @@ # 
#The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal @@ -78,29 +78,25 @@ include ../../make.top endif endif -#gslib -GS_DIR=${LIBP_TPL_DIR}/gslib - #libraries ELLIPTIC_DIR =${LIBP_DIR}/solvers/elliptic -FPE_LIBP_LIBS=timeStepper linearSolver parAlmond mesh ogs linAlg core +FPE_LIBP_LIBS=timeStepper linearSolver parAlmond mesh parAdogs ogs linAlg core #includes INCLUDES=-I${ELLIPTIC_DIR} \ - ${LIBP_INCLUDES} \ - -I. + ${LIBP_INCLUDES} \ + -I. #defines DEFINES =${LIBP_DEFINES} \ -DLIBP_DIR='"${LIBP_DIR}"' #.cpp compilation flags -FPE_CXXFLAGS=${LIBP_MPICXXFLAGS} ${DEFINES} ${INCLUDES} +FPE_CXXFLAGS=${LIBP_CXXFLAGS} ${DEFINES} ${INCLUDES} #link libraries LIBS=-L${ELLIPTIC_DIR} -lelliptic \ - -L${LIBP_LIBS_DIR} $(addprefix -l,$(FPE_LIBP_LIBS)) \ - -L$(GS_DIR)/lib -lgs \ + -L${LIBP_LIBS_DIR} $(addprefix -l,$(FPE_LIBP_LIBS)) \ ${LIBP_LIBS} #link flags @@ -156,10 +152,10 @@ endif # rule for .cpp files %.o: %.cpp $(DEPS) | libelliptic ifneq (,${verbose}) - $(LIBP_MPICXX) -o $*.o -c $*.cpp $(FPE_CXXFLAGS) + $(LIBP_CXX) -o $*.o -c $*.cpp $(FPE_CXXFLAGS) else @printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n"; - @$(LIBP_MPICXX) -o $*.o -c $*.cpp $(FPE_CXXFLAGS) + @$(LIBP_CXX) -o $*.o -c $*.cpp $(FPE_CXXFLAGS) endif #cleanup @@ -171,8 +167,7 @@ clean-libs: clean ${MAKE} -C ${LIBP_LIBS_DIR} clean clean-kernels: clean-libs -# $(shell ${OCCA_DIR}/bin/occa clear all -y) - rm -rf ~/.occa/ + rm -rf ${LIBP_DIR}/.occa/ realclean: clean ${MAKE} -C ${ELLIPTIC_DIR} clean diff --git a/solvers/fokkerPlanck/okl/fpeAdvectionHex3D.okl b/solvers/fokkerPlanck/okl/fpeAdvectionHex3D.okl index ee60ada81..671c6a0b1 100644 --- a/solvers/fokkerPlanck/okl/fpeAdvectionHex3D.okl +++ 
b/solvers/fokkerPlanck/okl/fpeAdvectionHex3D.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -58,8 +58,6 @@ SOFTWARE. } } - @barrier("local"); - #pragma unroll p_Nq for(int k=0;k - @barrier("local"); for(int j=0;j - @barrier("local"); for(int j=0;j& Q, std::string fileName){ FILE *fp; - fp = fopen(fileName, "w"); + fp = fopen(fileName.c_str(), "w"); fprintf(fp, "\n"); fprintf(fp, " \n"); @@ -44,30 +44,36 @@ void fpe_t::PlotFields(dfloat* Q, char *fileName){ fprintf(fp, " \n"); //scratch space for interpolation - size_t NscratchBytes = mymax(mesh.Np, mesh.plotNp)*sizeof(dfloat); - dfloat* scratch = (dfloat *) malloc(2*NscratchBytes); + size_t Nscratch = std::max(mesh.Np, mesh.plotNp); + memory scratch(2*Nscratch); - dfloat* Ix = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iy = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iz = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); + memory Ix(mesh.plotNp); + memory Iy(mesh.plotNp); + memory Iz(mesh.plotNp); // compute plot node coordinates on the fly for(dlong e=0;e\n"); fprintf(fp, " \n"); - free(Ix); free(Iy); free(Iz); - - dfloat* Ip = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); + memory Ip(mesh.plotNp); // write out field fprintf(fp, " \n"); @@ -83,8 +89,6 @@ void fpe_t::PlotFields(dfloat* Q, char *fileName){ fprintf(fp, " \n"); fprintf(fp, " \n"); - free(Ip); - fprintf(fp, " \n"); fprintf(fp, " \n"); @@ -125,6 +129,4 @@ void fpe_t::PlotFields(dfloat* Q, char *fileName){ fprintf(fp, " \n"); fprintf(fp, "\n"); fclose(fp); - - free(scratch); } diff --git a/solvers/fokkerPlanck/src/fpeReport.cpp b/solvers/fokkerPlanck/src/fpeReport.cpp index d1f76988f..57a7e2ca7 100644 --- 
a/solvers/fokkerPlanck/src/fpeReport.cpp +++ b/solvers/fokkerPlanck/src/fpeReport.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -34,7 +34,7 @@ void fpe_t::Report(dfloat time, int tstep){ mesh.MassMatrixApply(o_q, o_Mq); dlong Nentries = mesh.Nelements*mesh.Np; - dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm)); + dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm)); if(mesh.rank==0) printf("%5.2f (%d), %5.2f (time, timestep, norm)\n", time, tstep, norm2); @@ -45,11 +45,11 @@ void fpe_t::Report(dfloat time, int tstep){ o_q.copyTo(q); // output field files - string name; + std::string name; settings.getSetting("OUTPUT FILE NAME", name); char fname[BUFSIZ]; sprintf(fname, "%s_%04d_%04d.vtu", name.c_str(), mesh.rank, frame++); - PlotFields(q, fname); + PlotFields(q, std::string(fname)); } } diff --git a/solvers/fokkerPlanck/src/fpeRun.cpp b/solvers/fokkerPlanck/src/fpeRun.cpp index 8db0aecd0..149e849b3 100644 --- a/solvers/fokkerPlanck/src/fpeRun.cpp +++ b/solvers/fokkerPlanck/src/fpeRun.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -54,14 +54,14 @@ void fpe_t::Run(){ dt = dtAdvc; } else if (settings.compareSetting("TIME INTEGRATOR","SSBDF3")) { dt = Nsubcycles*dtAdvc; - subStepper->SetTimeStep(dtAdvc); + subStepper.SetTimeStep(dtAdvc); } else { - dt = mymin(dtAdvc, dtDiff); + dt = std::min(dtAdvc, dtDiff); } 
- timeStepper->SetTimeStep(dt); + timeStepper.SetTimeStep(dt); - timeStepper->Run(o_q, startTime, finalTime); + timeStepper.Run(*this, o_q, startTime, finalTime); // output norm of final solution { @@ -69,7 +69,7 @@ void fpe_t::Run(){ mesh.MassMatrixApply(o_q, o_Mq); dlong Nentries = mesh.Nelements*mesh.Np; - dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm)); + dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm)); if(mesh.rank==0) printf("Solution norm = %17.15lg\n", norm2); diff --git a/solvers/fokkerPlanck/src/fpeSettings.cpp b/solvers/fokkerPlanck/src/fpeSettings.cpp index 036849eeb..c4acb693b 100644 --- a/solvers/fokkerPlanck/src/fpeSettings.cpp +++ b/solvers/fokkerPlanck/src/fpeSettings.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,7 +27,7 @@ SOFTWARE. 
#include "fpe.hpp" //settings for fpe solver -fpeSettings_t::fpeSettings_t(MPI_Comm& _comm): +fpeSettings_t::fpeSettings_t(comm_t& _comm): settings_t(_comm) { newSetting("DATA FILE", @@ -87,10 +87,7 @@ fpeSettings_t::fpeSettings_t(MPI_Comm& _comm): void fpeSettings_t::report() { - int rank; - MPI_Comm_rank(comm, &rank); - - if (rank==0) { + if (comm.rank()==0) { std::cout << "Fokker Planck Settings:\n\n"; reportSetting("DATA FILE"); reportSetting("VISCOSITY"); @@ -132,15 +129,15 @@ void fpeSettings_t::report() { void fpeSettings_t::parseFromFile(platformSettings_t& platformSettings, meshSettings_t& meshSettings, - const string filename) { + const std::string filename) { //read all settings from file settings_t s(comm); s.readSettingsFromFile(filename); for(auto it = s.settings.begin(); it != s.settings.end(); ++it) { - setting_t* set = it->second; - const string name = set->getName(); - const string val = set->getVal(); + setting_t& set = it->second; + const std::string name = set.getName(); + const std::string val = set.getVal(); if (platformSettings.hasSetting(name)) platformSettings.changeSetting(name, val); else if (meshSettings.hasSetting(name)) @@ -148,26 +145,24 @@ void fpeSettings_t::parseFromFile(platformSettings_t& platformSettings, else if (hasSetting(name)) //self changeSetting(name, val); else { - stringstream ss; - ss << "Unknown setting: [" << name << "] requested"; - LIBP_ABORT(ss.str()); + LIBP_FORCE_ABORT("Unknown setting: [" << name << "] requested"); } } } -ellipticSettings_t* fpeSettings_t::extractEllipticSettings() { +ellipticSettings_t fpeSettings_t::extractEllipticSettings() { - ellipticSettings_t* ellipticSettings = new ellipticSettings_t(comm); + ellipticSettings_t ellipticSettings(comm); - for(auto it = ellipticSettings->settings.begin(); it != ellipticSettings->settings.end(); ++it) { - setting_t* set = it->second; - const string name = set->getName(); + for(auto it = ellipticSettings.settings.begin(); it != 
ellipticSettings.settings.end(); ++it) { + setting_t& set = it->second; + const std::string name = set.getName(); - string val; + std::string val; getSetting("ELLIPTIC "+name, val); - set->updateVal(val); + set.updateVal(val); } return ellipticSettings; -} \ No newline at end of file +} diff --git a/solvers/fokkerPlanck/src/fpeSetup.cpp b/solvers/fokkerPlanck/src/fpeSetup.cpp index c75123f52..947d32c48 100644 --- a/solvers/fokkerPlanck/src/fpeSetup.cpp +++ b/solvers/fokkerPlanck/src/fpeSetup.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,19 +26,25 @@ SOFTWARE. #include "fpe.hpp" -fpe_t& fpe_t::Setup(platform_t& platform, mesh_t& mesh, - fpeSettings_t& settings){ +void fpe_t::Setup(platform_t& _platform, mesh_t& _mesh, + fpeSettings_t& _settings){ - fpe_t* fpe = new fpe_t(platform, mesh, settings); + platform = _platform; + mesh = _mesh; + comm = _mesh.comm; + settings = _settings; - settings.getSetting("VISCOSITY", fpe->mu); + //Trigger JIT kernel builds + ogs::InitializeKernels(platform, ogs::Dfloat, ogs::Add); - fpe->cubature = (settings.compareSetting("ADVECTION TYPE", "CUBATURE")) ? 1:0; + settings.getSetting("VISCOSITY", mu); + + cubature = (settings.compareSetting("ADVECTION TYPE", "CUBATURE")) ? 
1:0; //setup cubature - if (fpe->cubature) { + if (cubature) { mesh.CubatureSetup(); - mesh.CubatureNodes(); + mesh.CubaturePhysicalNodes(); } dlong Nlocal = mesh.Nelements*mesh.Np; @@ -47,145 +53,171 @@ fpe_t& fpe_t::Setup(platform_t& platform, mesh_t& mesh, //setup timeStepper dfloat gamma = 0.0; if (settings.compareSetting("TIME INTEGRATOR","AB3")){ - fpe->timeStepper = new TimeStepper::ab3(mesh.Nelements, mesh.totalHaloPairs, - mesh.Np, 1, *fpe); + timeStepper.Setup(mesh.Nelements, + mesh.totalHaloPairs, + mesh.Np, 1, platform, comm); } else if (settings.compareSetting("TIME INTEGRATOR","LSERK4")){ - fpe->timeStepper = new TimeStepper::lserk4(mesh.Nelements, mesh.totalHaloPairs, - mesh.Np, 1, *fpe); + timeStepper.Setup(mesh.Nelements, + mesh.totalHaloPairs, + mesh.Np, 1, platform, comm); } else if (settings.compareSetting("TIME INTEGRATOR","DOPRI5")){ - fpe->timeStepper = new TimeStepper::dopri5(mesh.Nelements, mesh.totalHaloPairs, - mesh.Np, 1, *fpe, mesh.comm); + timeStepper.Setup(mesh.Nelements, + mesh.totalHaloPairs, + mesh.Np, 1, platform, comm); } else if (settings.compareSetting("TIME INTEGRATOR","EXTBDF3")){ - fpe->timeStepper = new TimeStepper::extbdf3(mesh.Nelements, mesh.totalHaloPairs, - mesh.Np, 1, *fpe); - gamma = ((TimeStepper::extbdf3*) fpe->timeStepper)->getGamma(); + timeStepper.Setup(mesh.Nelements, + mesh.totalHaloPairs, + mesh.Np, 1, platform, comm); + gamma = timeStepper.GetGamma(); } else if (settings.compareSetting("TIME INTEGRATOR","SSBDF3")){ - fpe->timeStepper = new TimeStepper::ssbdf3(mesh.Nelements, mesh.totalHaloPairs, - mesh.Np, 1, *fpe); - gamma = ((TimeStepper::ssbdf3*) fpe->timeStepper)->getGamma(); + timeStepper.Setup(mesh.Nelements, + mesh.totalHaloPairs, + mesh.Np, 1, platform, comm); + gamma = timeStepper.GetGamma(); } - fpe->Nsubcycles=1; + Nsubcycles=1; if (settings.compareSetting("TIME INTEGRATOR","SSBDF3")) - settings.getSetting("NUMBER OF SUBCYCLES", fpe->Nsubcycles); + settings.getSetting("NUMBER OF SUBCYCLES", 
Nsubcycles); //Setup Elliptic solver - fpe->elliptic=NULL; - fpe->linearSolver=NULL; if (settings.compareSetting("TIME INTEGRATOR","EXTBDF3") ||settings.compareSetting("TIME INTEGRATOR","SSBDF3")){ int NBCTypes = 7; - int BCType[NBCTypes] = {0,1,1,2,1,1,1}; // bc=3 => outflow => Neumann => vBCType[3] = 2, etc. - - fpe->ellipticSettings = settings.extractEllipticSettings(); + memory BCType(NBCTypes); + // bc=3 => outflow => Neumann => vBCType[3] = 2, etc. + BCType[0] = 0; + BCType[1] = 1; + BCType[2] = 1; + BCType[3] = 2; + BCType[4] = 1; + BCType[5] = 1; + BCType[6] = 1; + + ellipticSettings = _settings.extractEllipticSettings(); //make a guess at dt for the lambda value //TODO: we should allow preconditioners to be re-setup if lambda is updated dfloat hmin = mesh.MinCharacteristicLength(); - dfloat dtAdvc = fpe->Nsubcycles*hmin/((mesh.N+1.)*(mesh.N+1.)); - dfloat lambda = gamma/(dtAdvc*fpe->mu); - - fpe->elliptic = &(elliptic_t::Setup(platform, mesh, *(fpe->ellipticSettings), - lambda, NBCTypes, BCType)); - fpe->tau = fpe->elliptic->tau; - - fpe->linearSolver = linearSolver_t::Setup(fpe->elliptic->Ndofs, fpe->elliptic->Nhalo, - platform, *(fpe->ellipticSettings), mesh.comm); + dfloat dtAdvc = Nsubcycles*hmin/((mesh.N+1.)*(mesh.N+1.)); + dfloat lambda = gamma/(dtAdvc*mu); + + elliptic.Setup(platform, mesh, ellipticSettings, + lambda, NBCTypes, BCType); + tau = elliptic.tau; + + if (ellipticSettings.compareSetting("LINEAR SOLVER","NBPCG")){ + linearSolver.Setup(elliptic.Ndofs, elliptic.Nhalo, + platform, ellipticSettings, comm); + } else if (ellipticSettings.compareSetting("LINEAR SOLVER","NBFPCG")){ + linearSolver.Setup(elliptic.Ndofs, elliptic.Nhalo, + platform, ellipticSettings, comm); + } else if (ellipticSettings.compareSetting("LINEAR SOLVER","PCG")){ + linearSolver.Setup(elliptic.Ndofs, elliptic.Nhalo, + platform, ellipticSettings, comm); + } else if (ellipticSettings.compareSetting("LINEAR SOLVER","PGMRES")){ + linearSolver.Setup(elliptic.Ndofs, 
elliptic.Nhalo, + platform, ellipticSettings, comm); + } else if (ellipticSettings.compareSetting("LINEAR SOLVER","PMINRES")){ + linearSolver.Setup(elliptic.Ndofs, elliptic.Nhalo, + platform, ellipticSettings, comm); + } } else { //set penalty - if (mesh.elementType==TRIANGLES || - mesh.elementType==QUADRILATERALS){ - fpe->tau = 2.0*(mesh.N+1)*(mesh.N+2)/2.0; + if (mesh.elementType==Mesh::TRIANGLES || + mesh.elementType==Mesh::QUADRILATERALS){ + tau = 2.0*(mesh.N+1)*(mesh.N+2)/2.0; if(mesh.dim==3) - fpe->tau *= 1.5; + tau *= 1.5; } else - fpe->tau = 2.0*(mesh.N+1)*(mesh.N+3); + tau = 2.0*(mesh.N+1)*(mesh.N+3); } //setup linear algebra module - platform.linAlg.InitKernels({"innerProd", "axpy", "max"}); + platform.linAlg().InitKernels({"innerProd", "axpy", "max"}); /*setup trace halo exchange */ - fpe->traceHalo = mesh.HaloTraceSetup(1); //one field + traceHalo = mesh.HaloTraceSetup(1); //one field // compute samples of q at interpolation nodes - fpe->q = (dfloat*) calloc(Nlocal+Nhalo, sizeof(dfloat)); - fpe->o_q = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), fpe->q); + q.malloc(Nlocal+Nhalo, 0.0); + o_q = platform.malloc(q); //storage for M*q during reporting - fpe->o_Mq = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), fpe->q); + o_Mq = platform.malloc(q); mesh.MassMatrixKernelSetup(1); // mass matrix operator - fpe->grad = (dfloat*) calloc((Nlocal+Nhalo)*4, sizeof(dfloat)); - fpe->o_grad = platform.malloc((Nlocal+Nhalo)*4*sizeof(dfloat), fpe->grad); + grad.malloc((Nlocal+Nhalo)*4, 0.0); + o_grad = platform.malloc(grad); // OCCA build stuff - occa::properties kernelInfo = mesh.props; //copy base occa properties + properties_t kernelInfo = mesh.props; //copy base occa properties //add boundary data to kernel info - string dataFileName; + std::string dataFileName; settings.getSetting("DATA FILE", dataFileName); kernelInfo["includes"] += dataFileName; kernelInfo["defines/" "p_Nfields"]= 1; - int maxNodes = mymax(mesh.Np, (mesh.Nfp*mesh.Nfaces)); + int maxNodes = 
std::max(mesh.Np, (mesh.Nfp*mesh.Nfaces)); kernelInfo["defines/" "p_maxNodes"]= maxNodes; int blockMax = 256; if (platform.device.mode() == "CUDA") blockMax = 512; - int NblockV = mymax(1, blockMax/mesh.Np); + int NblockV = std::max(1, blockMax/mesh.Np); kernelInfo["defines/" "p_NblockV"]= NblockV; - int NblockS = mymax(1, blockMax/maxNodes); + int NblockS = std::max(1, blockMax/maxNodes); kernelInfo["defines/" "p_NblockS"]= NblockS; - if (fpe->cubature) { - int cubMaxNodes = mymax(mesh.Np, (mesh.intNfp*mesh.Nfaces)); + if (cubature) { + int cubMaxNodes = std::max(mesh.Np, (mesh.intNfp*mesh.Nfaces)); kernelInfo["defines/" "p_cubMaxNodes"]= cubMaxNodes; - int cubMaxNodes1 = mymax(mesh.Np, (mesh.intNfp)); + int cubMaxNodes1 = std::max(mesh.Np, (mesh.intNfp)); kernelInfo["defines/" "p_cubMaxNodes1"]= cubMaxNodes1; - int cubNblockV = mymax(1, blockMax/mesh.cubNp); + int cubNblockV = std::max(1, blockMax/mesh.cubNp); kernelInfo["defines/" "p_cubNblockV"]= cubNblockV; - int cubNblockS = mymax(1, blockMax/cubMaxNodes); + int cubNblockS = std::max(1, blockMax/cubMaxNodes); kernelInfo["defines/" "p_cubNblockS"]= cubNblockS; } - kernelInfo["parser/" "automate-add-barriers"] = "disabled"; - // set kernel name suffix - char *suffix; - if(mesh.elementType==TRIANGLES) - suffix = strdup("Tri2D"); - if(mesh.elementType==QUADRILATERALS) - suffix = strdup("Quad2D"); - if(mesh.elementType==TETRAHEDRA) - suffix = strdup("Tet3D"); - if(mesh.elementType==HEXAHEDRA) - suffix = strdup("Hex3D"); - - char fileName[BUFSIZ], kernelName[BUFSIZ]; + std::string suffix; + if(mesh.elementType==Mesh::TRIANGLES) + suffix = "Tri2D"; + if(mesh.elementType==Mesh::QUADRILATERALS) + suffix = "Quad2D"; + if(mesh.elementType==Mesh::TETRAHEDRA) + suffix = "Tet3D"; + if(mesh.elementType==Mesh::HEXAHEDRA) + suffix = "Hex3D"; + + std::string oklFilePrefix = DFPE "/okl/"; + std::string oklFileSuffix = ".okl"; + + std::string fileName, kernelName; // advection kernels - if (fpe->cubature) { - sprintf(fileName, 
DFPE "/okl/fpeCubatureAdvection%s.okl", suffix); - sprintf(kernelName, "fpeAdvectionCubatureVolume%s", suffix); - fpe->advectionVolumeKernel = platform.buildKernel(fileName, kernelName, + if (cubature) { + fileName = oklFilePrefix + "fpeCubatureAdvection" + suffix + oklFileSuffix; + kernelName = "fpeAdvectionCubatureVolume" + suffix; + advectionVolumeKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(kernelName, "fpeAdvectionCubatureSurface%s", suffix); - fpe->advectionSurfaceKernel = platform.buildKernel(fileName, kernelName, + kernelName = "fpeAdvectionCubatureSurface" + suffix; + advectionSurfaceKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } else { - sprintf(fileName, DFPE "/okl/fpeAdvection%s.okl", suffix); - sprintf(kernelName, "fpeAdvectionVolume%s", suffix); - fpe->advectionVolumeKernel = platform.buildKernel(fileName, kernelName, + fileName = oklFilePrefix + "fpeAdvection" + suffix + oklFileSuffix; + kernelName = "fpeAdvectionVolume" + suffix; + advectionVolumeKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(kernelName, "fpeAdvectionSurface%s", suffix); - fpe->advectionSurfaceKernel = platform.buildKernel(fileName, kernelName, + kernelName = "fpeAdvectionSurface" + suffix; + advectionSurfaceKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } @@ -193,72 +225,63 @@ fpe_t& fpe_t::Setup(platform_t& platform, mesh_t& mesh, // diffusion kernels if (settings.compareSetting("TIME INTEGRATOR","EXTBDF3") ||settings.compareSetting("TIME INTEGRATOR","SSBDF3")) { - sprintf(fileName, DFPE "/okl/fpeDiffusionRhs%s.okl", suffix); - sprintf(kernelName, "fpeDiffusionRhs%s", suffix); - fpe->diffusionRhsKernel = platform.buildKernel(fileName, kernelName, + fileName = oklFilePrefix + "fpeDiffusionRhs" + suffix + oklFileSuffix; + kernelName = "fpeDiffusionRhs" + suffix; + diffusionRhsKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } else { // gradient kernel - sprintf(fileName, 
DFPE "/okl/fpeGradient%s.okl", suffix); - sprintf(kernelName, "fpeGradient%s", suffix); - fpe->gradientKernel = platform.buildKernel(fileName, kernelName, + fileName = oklFilePrefix + "fpeGradient" + suffix + oklFileSuffix; + kernelName = "fpeGradient" + suffix; + gradientKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(fileName, DFPE "/okl/fpeDiffusion%s.okl", suffix); - sprintf(kernelName, "fpeDiffusion%s", suffix); - fpe->diffusionKernel = platform.buildKernel(fileName, kernelName, + fileName = oklFilePrefix + "fpeDiffusion" + suffix + oklFileSuffix; + kernelName = "fpeDiffusion" + suffix; + diffusionKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } if (mesh.dim==2) { - sprintf(fileName, DFPE "/okl/fpeInitialCondition2D.okl"); - sprintf(kernelName, "fpeInitialCondition2D"); + fileName = oklFilePrefix + "fpeInitialCondition2D" + oklFileSuffix; + kernelName = "fpeInitialCondition2D"; } else { - sprintf(fileName, DFPE "/okl/fpeInitialCondition3D.okl"); - sprintf(kernelName, "fpeInitialCondition3D"); + fileName = oklFilePrefix + "fpeInitialCondition3D" + oklFileSuffix; + kernelName = "fpeInitialCondition3D"; } - fpe->initialConditionKernel = platform.buildKernel(fileName, kernelName, + initialConditionKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(fileName, DFPE "/okl/fpeMaxWaveSpeed%s.okl", suffix); - sprintf(kernelName, "fpeMaxWaveSpeed%s", suffix); + fileName = oklFilePrefix + "fpeMaxWaveSpeed" + suffix + oklFileSuffix; + kernelName = "fpeMaxWaveSpeed" + suffix; - fpe->maxWaveSpeedKernel = platform.buildKernel(fileName, kernelName, kernelInfo); + maxWaveSpeedKernel = platform.buildKernel(fileName, kernelName, kernelInfo); //build subcycler - fpe->subcycler=NULL; - fpe->subStepper=NULL; if (settings.compareSetting("TIME INTEGRATOR","SSBDF3")) { - fpe->subcycler = new subcycler_t(*fpe); + subcycler.platform = platform; + subcycler.mesh = mesh; + subcycler.comm = comm; + subcycler.settings = 
settings; + + subcycler.cubature = cubature; + subcycler.traceHalo = traceHalo; + subcycler.advectionVolumeKernel = advectionVolumeKernel; + subcycler.advectionSurfaceKernel = advectionSurfaceKernel; + if (settings.compareSetting("SUBCYCLING TIME INTEGRATOR","AB3")){ - fpe->subStepper = new TimeStepper::ab3(mesh.Nelements, mesh.totalHaloPairs, - mesh.Np, 1, *(fpe->subcycler)); + subStepper.Setup(mesh.Nelements, + mesh.totalHaloPairs, + mesh.Np, 1, platform, comm); } else if (settings.compareSetting("SUBCYCLING TIME INTEGRATOR","LSERK4")){ - fpe->subStepper = new TimeStepper::lserk4(mesh.Nelements, mesh.totalHaloPairs, - mesh.Np, 1, *(fpe->subcycler)); + subStepper.Setup(mesh.Nelements, + mesh.totalHaloPairs, + mesh.Np, 1, platform, comm); } else if (settings.compareSetting("SUBCYCLING TIME INTEGRATOR","DOPRI5")){ - fpe->subStepper = new TimeStepper::dopri5(mesh.Nelements, mesh.totalHaloPairs, - mesh.Np, 1, *(fpe->subcycler), mesh.comm); + subStepper.Setup(mesh.Nelements, + mesh.totalHaloPairs, + mesh.Np, 1, platform, comm); } } - - return *fpe; -} - -fpe_t::~fpe_t() { - advectionVolumeKernel.free(); - advectionSurfaceKernel.free(); - gradientKernel.free(); - diffusionKernel.free(); - diffusionRhsKernel.free(); - initialConditionKernel.free(); - maxWaveSpeedKernel.free(); - - if (elliptic) delete elliptic; - if (timeStepper) delete timeStepper; - if (linearSolver) delete linearSolver; - if (subStepper) delete subStepper; - if (subcycler) delete subcycler; - if (traceHalo) traceHalo->Free(); } diff --git a/solvers/fokkerPlanck/src/fpeStep.cpp b/solvers/fokkerPlanck/src/fpeStep.cpp index cf5b63fbf..1c22d332b 100644 --- a/solvers/fokkerPlanck/src/fpeStep.cpp +++ b/solvers/fokkerPlanck/src/fpeStep.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of 
this software and associated documentation files (the "Software"), to deal @@ -26,10 +26,10 @@ SOFTWARE. #include "fpe.hpp" -dfloat fpe_t::MaxWaveSpeed(occa::memory& o_Q, const dfloat T){ +dfloat fpe_t::MaxWaveSpeed(deviceMemory& o_Q, const dfloat T){ //Note: if this is on the critical path in the future, we should pre-allocate this - occa::memory o_maxSpeed = platform.malloc(mesh.Nelements*sizeof(dfloat)); + deviceMemory o_maxSpeed = platform.malloc(mesh.Nelements); maxWaveSpeedKernel(mesh.Nelements, mesh.o_vgeo, @@ -43,31 +43,30 @@ dfloat fpe_t::MaxWaveSpeed(occa::memory& o_Q, const dfloat T){ o_Q, o_maxSpeed); - const dfloat vmax = platform.linAlg.max(mesh.Nelements, o_maxSpeed, mesh.comm); + const dfloat vmax = platform.linAlg().max(mesh.Nelements, o_maxSpeed, mesh.comm); - o_maxSpeed.free(); return vmax; } //evaluate ODE rhs = f(q,t) -void fpe_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){ +void fpe_t::rhsf(deviceMemory& o_Q, deviceMemory& o_RHS, const dfloat T){ Advection(o_Q, o_RHS, T); Diffusion(o_Q, o_RHS, T); } // Evaluation of rhs f function -void fpe_t::rhs_imex_f(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){ +void fpe_t::rhs_imex_f(deviceMemory& o_Q, deviceMemory& o_RHS, const dfloat T){ Advection(o_Q, o_RHS, T); } // Evaluation of rhs g function -void fpe_t::rhs_imex_g(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){ +void fpe_t::rhs_imex_g(deviceMemory& o_Q, deviceMemory& o_RHS, const dfloat T){ Diffusion(o_Q, o_RHS, T); } // Inversion of diffusion operator // Solves gamma*q - mu*Laplacian*q = rhs -void fpe_t::rhs_imex_invg(occa::memory& o_RHS, occa::memory& o_Q, const dfloat gamma, const dfloat T){ +void fpe_t::rhs_imex_invg(deviceMemory& o_RHS, deviceMemory& o_Q, const dfloat gamma, const dfloat T){ // rhs = MM*rhs/mu diffusionRhsKernel(mesh.Nelements, @@ -91,9 +90,9 @@ void fpe_t::rhs_imex_invg(occa::memory& o_RHS, occa::memory& o_Q, const dfloat g int verbose =0; //call the solver to solve -Laplacian*q + 
lambda*q = rhs - dfloat tol = 1e-8; - elliptic->lambda = gamma/mu; - int iter = elliptic->Solve(*linearSolver, o_Q, o_RHS, tol, maxIter, verbose); + dfloat tol = (sizeof(dfloat)==sizeof(double)) ? 1.0e-8 : 1.0e-5; + elliptic.lambda = gamma/mu; + int iter = elliptic.Solve(linearSolver, o_Q, o_RHS, tol, maxIter, verbose); if (mesh.rank==0){ printf("\rSolver iterations: %3d. ", iter); fflush(stdout); @@ -101,8 +100,8 @@ void fpe_t::rhs_imex_invg(occa::memory& o_RHS, occa::memory& o_Q, const dfloat g } // Evolve rhs f function via a sub-timestepper -void fpe_t::rhs_subcycle_f(occa::memory& o_Q, occa::memory& o_QHAT, - const dfloat T, const dfloat dt, const dfloat* B, +void fpe_t::rhs_subcycle_f(deviceMemory& o_Q, deviceMemory& o_QHAT, + const dfloat T, const dfloat dt, const memory B, const int order, const int shiftIndex, const int maxOrder) { //subcycle each Lagrangian state qhat by stepping dqhat/dt = F(qhat,t) @@ -118,21 +117,21 @@ void fpe_t::rhs_subcycle_f(occa::memory& o_Q, occa::memory& o_QHAT, for (int n=order;n>=0;n--) { //for each history state, starting with oldest //q at t-n*dt - occa::memory o_Qn = o_Q + ((shiftIndex+n)%maxOrder)*N*sizeof(dfloat); + deviceMemory o_Qn = o_Q + ((shiftIndex+n)%maxOrder)*N; //next scaled partial sum - platform.linAlg.axpy(N, B[n+1]/(B[n+1]+bSum), o_Qn, - bSum/(B[n+1]+bSum), o_QHAT); + platform.linAlg().axpy(N, B[n+1]/(B[n+1]+bSum), o_Qn, + bSum/(B[n+1]+bSum), o_QHAT); bSum += B[n+1]; - subStepper->Run(o_QHAT, T-n*dt, T-(n-1)*dt); + subStepper.Run(subcycler, o_QHAT, T-n*dt, T-(n-1)*dt); } } -void fpe_t::Advection(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T) { +void fpe_t::Advection(deviceMemory& o_Q, deviceMemory& o_RHS, const dfloat T) { // extract q halo on DEVICE - traceHalo->ExchangeStart(o_Q, 1, ogs_dfloat); + traceHalo.ExchangeStart(o_Q, 1); if (cubature) advectionVolumeKernel(mesh.Nelements, @@ -159,7 +158,7 @@ void fpe_t::Advection(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T) { o_Q, o_RHS); - 
traceHalo->ExchangeFinish(o_Q, 1, ogs_dfloat); + traceHalo.ExchangeFinish(o_Q, 1); if (cubature) advectionSurfaceKernel(mesh.Nelements, @@ -191,7 +190,7 @@ void fpe_t::Advection(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T) { o_RHS); } -void fpe_t::Diffusion(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T) { +void fpe_t::Diffusion(deviceMemory& o_Q, deviceMemory& o_RHS, const dfloat T) { //compute gradq and pack with q gradientKernel(mesh.Nelements, @@ -200,7 +199,7 @@ void fpe_t::Diffusion(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T) { o_Q, o_grad); - traceHalo->ExchangeStart(o_grad, 4, ogs_dfloat); + traceHalo.ExchangeStart(o_grad, 4); if(mesh.NinternalElements) diffusionKernel(mesh.NinternalElements, @@ -221,7 +220,7 @@ void fpe_t::Diffusion(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T) { o_grad, o_RHS); - traceHalo->ExchangeFinish(o_grad, 4, ogs_dfloat); + traceHalo.ExchangeFinish(o_grad, 4); if(mesh.NhaloElements) diffusionKernel(mesh.NhaloElements, @@ -241,4 +240,4 @@ void fpe_t::Diffusion(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T) { mu, o_grad, o_RHS); -} \ No newline at end of file +} diff --git a/solvers/fokkerPlanck/src/fpeSubcycle.cpp b/solvers/fokkerPlanck/src/fpeSubcycle.cpp index 9ad2c2b6d..82211d416 100644 --- a/solvers/fokkerPlanck/src/fpeSubcycle.cpp +++ b/solvers/fokkerPlanck/src/fpeSubcycle.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,19 +26,10 @@ SOFTWARE. 
#include "fpe.hpp" -subcycler_t::subcycler_t(fpe_t& fpe): - solver_t(fpe.platform, fpe.settings), mesh(fpe.mesh) { - - cubature = fpe.cubature; - traceHalo = fpe.traceHalo; - advectionVolumeKernel = fpe.advectionVolumeKernel; - advectionSurfaceKernel = fpe.advectionSurfaceKernel; -} - //evaluate ODE rhs = f(q,t) -void subcycler_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){ +void subcycler_t::rhsf(deviceMemory& o_Q, deviceMemory& o_RHS, const dfloat T){ // extract q halo on DEVICE - traceHalo->ExchangeStart(o_Q, 1, ogs_dfloat); + traceHalo.ExchangeStart(o_Q, 1); if (cubature) advectionVolumeKernel(mesh.Nelements, @@ -65,7 +56,7 @@ void subcycler_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){ o_Q, o_RHS); - traceHalo->ExchangeFinish(o_Q, 1, ogs_dfloat); + traceHalo.ExchangeFinish(o_Q, 1); if (cubature) advectionSurfaceKernel(mesh.Nelements, diff --git a/solvers/gradient/data/gradientCos2D.h b/solvers/gradient/data/gradientCos2D.h index f29987648..7c81d8ac9 100644 --- a/solvers/gradient/data/gradientCos2D.h +++ b/solvers/gradient/data/gradientCos2D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/gradient/data/gradientCos3D.h b/solvers/gradient/data/gradientCos3D.h index 4585eaa57..9c0f2ca06 100644 --- a/solvers/gradient/data/gradientCos3D.h +++ b/solvers/gradient/data/gradientCos3D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), 
to deal diff --git a/solvers/gradient/gradient.hpp b/solvers/gradient/gradient.hpp index 1920031d2..39fc0adc3 100644 --- a/solvers/gradient/gradient.hpp +++ b/solvers/gradient/gradient.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -34,43 +34,44 @@ SOFTWARE. #define DGRADIENT LIBP_DIR"/solvers/gradient/" +using namespace libp; + class gradientSettings_t: public settings_t { public: - gradientSettings_t(MPI_Comm& _comm); + gradientSettings_t(comm_t _comm); void report(); void parseFromFile(platformSettings_t& platformSettings, meshSettings_t& meshSettings, - const string filename); + const std::string filename); }; class gradient_t: public solver_t { public: - mesh_t& mesh; + mesh_t mesh; int Nfields; - dfloat *q; - occa::memory o_q; + memory q; + deviceMemory o_q; - dfloat *gradq; - occa::memory o_gradq; + memory gradq; + deviceMemory o_gradq; - occa::memory o_Mgradq; + deviceMemory o_Mgradq; - occa::kernel volumeKernel; + kernel_t volumeKernel; - occa::kernel initialConditionKernel; + kernel_t initialConditionKernel; - gradient_t() = delete; + gradient_t() = default; gradient_t(platform_t &_platform, mesh_t &_mesh, - gradientSettings_t& _settings): - solver_t(_platform, _settings), mesh(_mesh) {} - - ~gradient_t(); + gradientSettings_t& _settings) { + Setup(_platform, _mesh, _settings); + } //setup - static gradient_t& Setup(platform_t& platform, mesh_t& mesh, - gradientSettings_t& settings); + void Setup(platform_t& _platform, mesh_t& _mesh, + gradientSettings_t& _settings); void Run(); diff --git a/solvers/gradient/gradientMain.cpp b/solvers/gradient/gradientMain.cpp index 076b6924d..30bbd2085 100644 --- a/solvers/gradient/gradientMain.cpp +++ 
b/solvers/gradient/gradientMain.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -29,39 +29,40 @@ SOFTWARE. int main(int argc, char **argv){ // start up MPI - MPI_Init(&argc, &argv); + Comm::Init(argc, argv); - MPI_Comm comm = MPI_COMM_WORLD; + LIBP_ABORT("Usage: ./gradientMain setupfile", argc!=2); - if(argc!=2) - LIBP_ABORT(string("Usage: ./gradientMain setupfile")); + { /*Scope so everything is destructed before MPI_Finalize */ + comm_t comm(Comm::World().Dup()); - //create default settings - platformSettings_t platformSettings(comm); - meshSettings_t meshSettings(comm); - gradientSettings_t gradientSettings(comm); + //create default settings + platformSettings_t platformSettings(comm); + meshSettings_t meshSettings(comm); + gradientSettings_t gradientSettings(comm); - //load settings from file - gradientSettings.parseFromFile(platformSettings, meshSettings, - argv[1]); + //load settings from file + gradientSettings.parseFromFile(platformSettings, meshSettings, + argv[1]); - // set up platform - platform_t platform(platformSettings); + // set up platform + platform_t platform(platformSettings); - platformSettings.report(); - meshSettings.report(); - gradientSettings.report(); + platformSettings.report(); + meshSettings.report(); + gradientSettings.report(); - // set up mesh - mesh_t& mesh = mesh_t::Setup(platform, meshSettings, comm); + // set up mesh + mesh_t mesh(platform, meshSettings, comm); - // set up gradient solver - gradient_t& gradient = gradient_t::Setup(platform, mesh, gradientSettings); + // set up gradient solver + gradient_t gradient(platform, mesh, gradientSettings); - // run - gradient.Run(); + // run + gradient.Run(); + } // close down MPI - 
MPI_Finalize(); + Comm::Finalize(); return LIBP_SUCCESS; } diff --git a/solvers/gradient/makefile b/solvers/gradient/makefile index ae03306c6..ee3ba7d66 100644 --- a/solvers/gradient/makefile +++ b/solvers/gradient/makefile @@ -2,7 +2,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal @@ -78,11 +78,8 @@ include ../../make.top endif endif -#gslib -GS_DIR=${LIBP_TPL_DIR}/gslib - #libraries -GRADIENT_LIBP_LIBS=mesh ogs linAlg core +GRADIENT_LIBP_LIBS=mesh parAdogs ogs linAlg core #includes INCLUDES=${LIBP_INCLUDES} \ @@ -97,7 +94,6 @@ GRADIENT_CXXFLAGS=${LIBP_CXXFLAGS} ${DEFINES} ${INCLUDES} #link libraries LIBS=-L${LIBP_LIBS_DIR} $(addprefix -l,$(GRADIENT_LIBP_LIBS)) \ - -L$(GS_DIR)/lib -lgs \ ${LIBP_LIBS} #link flags @@ -145,10 +141,10 @@ endif # rule for .cpp files %.o: %.cpp $(DEPS) | libp_libs ifneq (,${verbose}) - $(LIBP_MPICXX) -o $*.o -c $*.cpp $(GRADIENT_CXXFLAGS) + $(LIBP_CXX) -o $*.o -c $*.cpp $(GRADIENT_CXXFLAGS) else @printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n"; - @$(LIBP_MPICXX) -o $*.o -c $*.cpp $(GRADIENT_CXXFLAGS) + @$(LIBP_CXX) -o $*.o -c $*.cpp $(GRADIENT_CXXFLAGS) endif #cleanup @@ -159,8 +155,7 @@ clean-libs: clean ${MAKE} -C ${LIBP_LIBS_DIR} clean clean-kernels: clean-libs -# $(shell ${OCCA_DIR}/bin/occa clear all -y) - rm -rf ~/.occa/ + rm -rf ${LIBP_DIR}/.occa/ realclean: clean ${MAKE} -C ${LIBP_LIBS_DIR} realclean diff --git a/solvers/gradient/okl/gradientInitialCondition2D.okl b/solvers/gradient/okl/gradientInitialCondition2D.okl index de6cc2fd3..1086743dd 100644 --- a/solvers/gradient/okl/gradientInitialCondition2D.okl +++ b/solvers/gradient/okl/gradientInitialCondition2D.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim 
Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/gradient/okl/gradientInitialCondition3D.okl b/solvers/gradient/okl/gradientInitialCondition3D.okl index 8f8a09829..69c33fc9f 100644 --- a/solvers/gradient/okl/gradientInitialCondition3D.okl +++ b/solvers/gradient/okl/gradientInitialCondition3D.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/gradient/okl/gradientVolumeHex3D.okl b/solvers/gradient/okl/gradientVolumeHex3D.okl index cc997bf4b..7ece12235 100644 --- a/solvers/gradient/okl/gradientVolumeHex3D.okl +++ b/solvers/gradient/okl/gradientVolumeHex3D.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -36,29 +36,23 @@ for(dlong e=0;e\n"); //scratch space for interpolation - size_t NscratchBytes = mymax(mesh.Np, mesh.plotNp)*sizeof(dfloat); - dfloat* scratch = (dfloat *) malloc(2*NscratchBytes); + size_t Nscratch = std::max(mesh.Np, mesh.plotNp); + memory scratch(2*Nscratch); - dfloat* Ix = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iy = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iz = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); + memory Ix(mesh.plotNp); + memory 
Iy(mesh.plotNp); + memory Iz(mesh.plotNp); // compute plot node coordinates on the fly for(dlong e=0;e\n"); fprintf(fp, " \n"); - free(Ix); free(Iy); free(Iz); - - dfloat* Iq = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iu = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iv = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iw = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); + memory Iq(mesh.plotNp); + memory Iu(mesh.plotNp); + memory Iv(mesh.plotNp); + memory Iw(mesh.plotNp); // write out q fprintf(fp, " \n"); @@ -107,8 +113,6 @@ void gradient_t::PlotFields(){ fprintf(fp, " \n"); fprintf(fp, " \n"); - free(Iq); free(Iu); free(Iv); free(Iw); - fprintf(fp, " \n"); fprintf(fp, " \n"); @@ -149,6 +153,4 @@ void gradient_t::PlotFields(){ fprintf(fp, " \n"); fprintf(fp, "\n"); fclose(fp); - - free(scratch); } diff --git a/solvers/gradient/src/gradientReport.cpp b/solvers/gradient/src/gradientReport.cpp index 1d00441d0..220ff2a31 100644 --- a/solvers/gradient/src/gradientReport.cpp +++ b/solvers/gradient/src/gradientReport.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -32,7 +32,7 @@ void gradient_t::Report(){ mesh.MassMatrixApply(o_gradq, o_Mgradq); dlong Nentries = mesh.Nelements*mesh.Np*Nfields; - dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_gradq, o_Mgradq, mesh.comm)); + dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_gradq, o_Mgradq, mesh.comm)); if(mesh.rank==0) printf("%5.2f (norm)\n", norm2); diff --git a/solvers/gradient/src/gradientRun.cpp b/solvers/gradient/src/gradientRun.cpp index a9121dee9..0f05a878f 100644 --- a/solvers/gradient/src/gradientRun.cpp +++ b/solvers/gradient/src/gradientRun.cpp @@ -2,7 
+2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -48,7 +48,7 @@ void gradient_t::Run(){ mesh.MassMatrixApply(o_gradq, o_Mgradq); dlong Nentries = mesh.Nelements*mesh.Np*Nfields; - dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_gradq, o_Mgradq, mesh.comm)); + dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_gradq, o_Mgradq, mesh.comm)); if(mesh.rank==0) printf("Solution norm = %17.15lg\n", norm2); diff --git a/solvers/gradient/src/gradientSettings.cpp b/solvers/gradient/src/gradientSettings.cpp index 0a2da238d..a37b7fe7c 100644 --- a/solvers/gradient/src/gradientSettings.cpp +++ b/solvers/gradient/src/gradientSettings.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,7 +27,7 @@ SOFTWARE. 
#include "gradient.hpp" //settings for gradient solver -gradientSettings_t::gradientSettings_t(MPI_Comm& _comm): +gradientSettings_t::gradientSettings_t(comm_t _comm): settings_t(_comm) { newSetting("DATA FILE", @@ -42,10 +42,7 @@ gradientSettings_t::gradientSettings_t(MPI_Comm& _comm): void gradientSettings_t::report() { - int rank; - MPI_Comm_rank(comm, &rank); - - if (rank==0) { + if (comm.rank()==0) { std::cout << "Gradient Settings:\n\n"; reportSetting("DATA FILE"); reportSetting("OUTPUT TO FILE"); @@ -53,16 +50,16 @@ void gradientSettings_t::report() { } void gradientSettings_t::parseFromFile(platformSettings_t& platformSettings, - meshSettings_t& meshSettings, - const string filename) { + meshSettings_t& meshSettings, + const std::string filename) { //read all settings from file settings_t s(comm); s.readSettingsFromFile(filename); for(auto it = s.settings.begin(); it != s.settings.end(); ++it) { - setting_t* set = it->second; - const string name = set->getName(); - const string val = set->getVal(); + setting_t& set = it->second; + const std::string name = set.getName(); + const std::string val = set.getVal(); if (platformSettings.hasSetting(name)) platformSettings.changeSetting(name, val); else if (meshSettings.hasSetting(name)) @@ -70,9 +67,7 @@ void gradientSettings_t::parseFromFile(platformSettings_t& platformSettings, else if (hasSetting(name)) //self changeSetting(name, val); else { - stringstream ss; - ss << "Unknown setting: [" << name << "] requested"; - LIBP_ABORT(ss.str()); + LIBP_FORCE_ABORT("Unknown setting: [" << name << "] requested"); } } -} \ No newline at end of file +} diff --git a/solvers/gradient/src/gradientSetup.cpp b/solvers/gradient/src/gradientSetup.cpp index 0b334ff7f..0b5efb53e 100644 --- a/solvers/gradient/src/gradientSetup.cpp +++ b/solvers/gradient/src/gradientSetup.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel 
Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,76 +26,74 @@ SOFTWARE. #include "gradient.hpp" -gradient_t& gradient_t::Setup(platform_t& platform, mesh_t& mesh, - gradientSettings_t& settings){ +void gradient_t::Setup(platform_t& _platform, mesh_t& _mesh, + gradientSettings_t& _settings){ - gradient_t* gradient = new gradient_t(platform, mesh, settings); + platform = _platform; + mesh = _mesh; + comm = mesh.comm; + settings = _settings; - gradient->Nfields = mesh.dim; + Nfields = mesh.dim; dlong Nlocal = mesh.Nelements*mesh.Np; //setup linear algebra module - platform.linAlg.InitKernels({"innerProd"}); + platform.linAlg().InitKernels({"innerProd"}); // compute samples of q at interpolation nodes - gradient->q = (dfloat*) calloc(Nlocal, sizeof(dfloat)); - gradient->o_q = platform.malloc(Nlocal*sizeof(dfloat), gradient->q); + q.malloc(Nlocal); + o_q = platform.malloc(q); - gradient->gradq = (dfloat*) calloc(Nlocal*mesh.dim, sizeof(dfloat)); - gradient->o_gradq = platform.malloc(Nlocal*mesh.dim*sizeof(dfloat), gradient->gradq); + gradq.malloc(Nlocal*mesh.dim); + o_gradq = platform.malloc(gradq); //storage for M*gradq during reporting - gradient->o_Mgradq = platform.malloc(Nlocal*mesh.dim*sizeof(dfloat), gradient->gradq); - mesh.MassMatrixKernelSetup(gradient->Nfields); // mass matrix operator + o_Mgradq = platform.malloc(gradq); + mesh.MassMatrixKernelSetup(Nfields); // mass matrix operator // OCCA build stuff - occa::properties kernelInfo = mesh.props; //copy base occa properties + properties_t kernelInfo = mesh.props; //copy base occa properties //add boundary data to kernel info - string dataFileName; + std::string dataFileName; settings.getSetting("DATA FILE", dataFileName); kernelInfo["includes"] += dataFileName; - kernelInfo["defines/" "p_Nfields"]= gradient->Nfields; - - kernelInfo["parser/" 
"automate-add-barriers"] = "disabled"; + kernelInfo["defines/" "p_Nfields"]= Nfields; // set kernel name suffix - char *suffix; - if(mesh.elementType==TRIANGLES) - suffix = strdup("Tri2D"); - if(mesh.elementType==QUADRILATERALS) - suffix = strdup("Quad2D"); - if(mesh.elementType==TETRAHEDRA) - suffix = strdup("Tet3D"); - if(mesh.elementType==HEXAHEDRA) - suffix = strdup("Hex3D"); - - char fileName[BUFSIZ], kernelName[BUFSIZ]; + std::string suffix; + if(mesh.elementType==Mesh::TRIANGLES) + suffix = "Tri2D"; + if(mesh.elementType==Mesh::QUADRILATERALS) + suffix = "Quad2D"; + if(mesh.elementType==Mesh::TETRAHEDRA) + suffix = "Tet3D"; + if(mesh.elementType==Mesh::HEXAHEDRA) + suffix = "Hex3D"; + + std::string oklFilePrefix = DGRADIENT "/okl/"; + std::string oklFileSuffix = ".okl"; + + std::string fileName, kernelName; // kernels from volume file - sprintf(fileName, DGRADIENT "/okl/gradientVolume%s.okl", suffix); - sprintf(kernelName, "gradientVolume%s", suffix); + fileName = oklFilePrefix + "gradientVolume" + suffix + oklFileSuffix; + kernelName = "gradientVolume" + suffix; - gradient->volumeKernel = platform.buildKernel(fileName, kernelName, + volumeKernel = platform.buildKernel(fileName, kernelName, kernelInfo); if (mesh.dim==2) { - sprintf(fileName, DGRADIENT "/okl/gradientInitialCondition2D.okl"); - sprintf(kernelName, "gradientInitialCondition2D"); + fileName = oklFilePrefix + "gradientInitialCondition2D" + oklFileSuffix; + kernelName = "gradientInitialCondition2D"; } else { - sprintf(fileName, DGRADIENT "/okl/gradientInitialCondition3D.okl"); - sprintf(kernelName, "gradientInitialCondition3D"); + fileName = oklFilePrefix + "gradientInitialCondition3D" + oklFileSuffix; + kernelName = "gradientInitialCondition3D"; } - gradient->initialConditionKernel = platform.buildKernel(fileName, kernelName, + initialConditionKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - return *gradient; } - -gradient_t::~gradient_t() { - volumeKernel.free(); - 
initialConditionKernel.free(); -} \ No newline at end of file diff --git a/solvers/ins/data/insBeltrami3D.h b/solvers/ins/data/insBeltrami3D.h index ebce74e3f..0f30b1bdf 100644 --- a/solvers/ins/data/insBeltrami3D.h +++ b/solvers/ins/data/insBeltrami3D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/ins/data/insUniform2D.h b/solvers/ins/data/insUniform2D.h index 3e6146d72..166438443 100644 --- a/solvers/ins/data/insUniform2D.h +++ b/solvers/ins/data/insUniform2D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/ins/data/insUniform3D.h b/solvers/ins/data/insUniform3D.h index dda2129a9..eb9344af5 100644 --- a/solvers/ins/data/insUniform3D.h +++ b/solvers/ins/data/insUniform3D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/ins/data/insVortex2D.h b/solvers/ins/data/insVortex2D.h index cb7bc1510..9f683e6ad 100644 --- a/solvers/ins/data/insVortex2D.h +++ b/solvers/ins/data/insVortex2D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim 
Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/ins/ins.hpp b/solvers/ins/ins.hpp index 1900f4c9b..649d704cd 100644 --- a/solvers/ins/ins.hpp +++ b/solvers/ins/ins.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) - Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -38,64 +38,62 @@ #define DINS LIBP_DIR"/solvers/ins/" +using namespace libp; + class insSettings_t: public settings_t { public: - insSettings_t(MPI_Comm& _comm); + insSettings_t(comm_t& _comm); void report(); void parseFromFile(platformSettings_t& platformSettings, meshSettings_t& meshSettings, - const string filename); + const std::string filename); - ellipticSettings_t* extractVelocitySettings(); - ellipticSettings_t* extractPressureSettings(); + ellipticSettings_t extractVelocitySettings(); + ellipticSettings_t extractPressureSettings(); }; class ins_t; class subcycler_t: public solver_t { public: - mesh_t& mesh; + mesh_t mesh; int cubature; - halo_t* vTraceHalo; - occa::kernel advectionVolumeKernel; - occa::kernel advectionSurfaceKernel; + ogs::halo_t vTraceHalo; + kernel_t advectionVolumeKernel; + kernel_t advectionSurfaceKernel; - occa::kernel subCycleAdvectionKernel; + kernel_t subCycleAdvectionKernel; int NVfields; int order, maxOrder, shiftIndex; dfloat nu, T0, dt; - occa::memory o_Ue, o_Uh; - - subcycler_t() = delete; - subcycler_t(ins_t& ins); + deviceMemory o_Ue, o_Uh; - ~subcycler_t(){}; + subcycler_t() = default; void Report(dfloat time, int tstep){}; - void rhsf(occa::memory& o_q, occa::memory& o_rhs, const dfloat time); + void rhsf(deviceMemory& o_q, deviceMemory& 
o_rhs, const dfloat time); }; class ins_t: public solver_t { public: - mesh_t& mesh; - linAlg_t& linAlg; - TimeStepper::timeStepper_t* timeStepper; + mesh_t mesh; + timeStepper_t timeStepper; - halo_t* vTraceHalo; - halo_t* pTraceHalo; + ogs::halo_t vTraceHalo; + ogs::halo_t pTraceHalo; - ellipticSettings_t *vSettings, *pSettings; - elliptic_t *uSolver, *vSolver, *wSolver; - elliptic_t *pSolver; + ellipticSettings_t vSettings, pSettings; + elliptic_t uSolver, vSolver, wSolver; + elliptic_t pSolver; - linearSolver_t *uLinearSolver; - linearSolver_t *vLinearSolver; - linearSolver_t *wLinearSolver; - linearSolver_t *pLinearSolver; + linearSolver_t uLinearSolver; + linearSolver_t vLinearSolver; + linearSolver_t wLinearSolver; + linearSolver_t pLinearSolver; int NVfields, NTfields; @@ -108,109 +106,103 @@ class ins_t: public solver_t { dfloat nu; dfloat vTau, pTau; - dfloat *u, *p; - occa::memory o_u, o_p; + memory u, p; + deviceMemory o_u, o_p; - occa::memory o_GU; + deviceMemory o_GU; - occa::memory o_MU; + deviceMemory o_MU; - dfloat *Vort; - occa::memory o_Vort; + memory Vort; + deviceMemory o_Vort; //extra buffers for solvers - occa::memory o_UH, o_VH, o_WH; - occa::memory o_rhsU, o_rhsV, o_rhsW; - occa::memory o_rhsP, o_PI; - - occa::memory o_GUH, o_GVH, o_GWH; - occa::memory o_GrhsU, o_GrhsV, o_GrhsW; - occa::memory o_GrhsP, o_GP, o_GPI; + deviceMemory o_UH, o_VH, o_WH; + deviceMemory o_rhsU, o_rhsV, o_rhsW; + deviceMemory o_rhsP, o_PI; - int *mapB; //node-wise boundary flag - occa::memory o_mapB; + deviceMemory o_GUH, o_GVH, o_GWH; + deviceMemory o_GrhsU, o_GrhsV, o_GrhsW; + deviceMemory o_GrhsP, o_GP, o_GPI; //subcycling int Nsubcycles; - TimeStepper::timeStepper_t* subStepper; - subcycler_t *subcycler; + timeStepper_t subStepper; + subcycler_t subcycler; - occa::kernel advectionVolumeKernel; - occa::kernel advectionSurfaceKernel; + kernel_t advectionVolumeKernel; + kernel_t advectionSurfaceKernel; - occa::kernel divergenceVolumeKernel; - occa::kernel 
divergenceSurfaceKernel; + kernel_t divergenceVolumeKernel; + kernel_t divergenceSurfaceKernel; - occa::kernel gradientVolumeKernel; - occa::kernel gradientSurfaceKernel; + kernel_t gradientVolumeKernel; + kernel_t gradientSurfaceKernel; - occa::kernel velocityGradientKernel; - occa::kernel diffusionKernel; + kernel_t velocityGradientKernel; + kernel_t diffusionKernel; - occa::kernel velocityRhsKernel; - occa::kernel velocityBCKernel; + kernel_t velocityRhsKernel; + kernel_t velocityBCKernel; - occa::kernel pressureRhsKernel; - occa::kernel pressureBCKernel; + kernel_t pressureRhsKernel; + kernel_t pressureBCKernel; - occa::kernel pressureIncrementRhsKernel; - occa::kernel pressureIncrementBCKernel; + kernel_t pressureIncrementRhsKernel; + kernel_t pressureIncrementBCKernel; - occa::kernel vorticityKernel; + kernel_t vorticityKernel; - occa::kernel initialConditionKernel; - occa::kernel maxWaveSpeedKernel; + kernel_t initialConditionKernel; + kernel_t maxWaveSpeedKernel; - ins_t() = delete; + ins_t() = default; ins_t(platform_t &_platform, mesh_t &_mesh, - insSettings_t& _settings): - solver_t(_platform, _settings), mesh(_mesh), linAlg(platform.linAlg) {} - - ~ins_t(); + insSettings_t& _settings) { + Setup(_platform, _mesh, _settings); + } //setup - static ins_t& Setup(platform_t& platform, mesh_t& mesh, - insSettings_t& settings); - - void BoundarySetup(); + void Setup(platform_t& _platform, mesh_t& _mesh, + insSettings_t& _settings); void Run(); void Report(dfloat time, int tstep); - void PlotFields(dfloat* U, dfloat* P, dfloat *V, char *fileName); + void PlotFields(memory& U, memory& P, memory& V, std::string fileName); - dfloat MaxWaveSpeed(occa::memory& o_U, const dfloat T); + dfloat MaxWaveSpeed(deviceMemory& o_U, const dfloat T); - // void rhsf(occa::memory& o_q, occa::memory& o_rhs, const dfloat time); + // void rhsf(deviceMemory& o_q, deviceMemory& o_rhs, const dfloat time); - void rhs_imex_f(occa::memory& o_q, occa::memory& o_rhs, const dfloat time); - // 
void rhs_imex_g(occa::memory& o_q, occa::memory& o_rhs, const dfloat time); + void rhs_imex_f(deviceMemory& o_q, deviceMemory& o_rhs, const dfloat time); + // void rhs_imex_g(deviceMemory& o_q, deviceMemory& o_rhs, const dfloat time); - void rhs_imex_invg(occa::memory& o_q, occa::memory& o_rhs, const dfloat gamma, const dfloat time); + void rhs_imex_invg(deviceMemory& o_q, deviceMemory& o_rhs, const dfloat gamma, const dfloat time); - void rhs_subcycle_f(occa::memory& o_Q, occa::memory& o_QHAT, - const dfloat T, const dfloat dt, const dfloat* B, - const int order, const int shiftIndex, const int maxOrder); + void rhs_subcycle_f(deviceMemory& o_Q, deviceMemory& o_QHAT, + const dfloat T, const dfloat dt, const memory B, + const int order, const int shiftIndex, const int maxOrder); - void Advection(const dfloat alpha, occa::memory& o_U, - const dfloat beta, occa::memory& o_RHS, + void Advection(const dfloat alpha, deviceMemory& o_U, + const dfloat beta, deviceMemory& o_RHS, const dfloat T); - void Diffusion(const dfloat alpha, occa::memory& o_U, - const dfloat beta, occa::memory& o_RHS, + void Diffusion(const dfloat alpha, deviceMemory& o_U, + const dfloat beta, deviceMemory& o_RHS, const dfloat T); - void Divergence(const dfloat alpha, occa::memory& o_U, - const dfloat beta, occa::memory& o_RHS, + void Divergence(const dfloat alpha, deviceMemory& o_U, + const dfloat beta, deviceMemory& o_RHS, const dfloat T); - void Gradient(const dfloat alpha, occa::memory& o_P, - const dfloat beta, occa::memory& o_RHS, + void Gradient(const dfloat alpha, deviceMemory& o_P, + const dfloat beta, deviceMemory& o_RHS, const dfloat T); - void VelocitySolve(occa::memory& o_U, occa::memory& o_RHS, + void VelocitySolve(deviceMemory& o_U, deviceMemory& o_RHS, const dfloat gamma, const dfloat T); - void PressureSolve(occa::memory& o_P, occa::memory& o_RHS, + void PressureSolve(deviceMemory& o_P, deviceMemory& o_RHS, const dfloat gamma, const dfloat T); - void 
PressureIncrementSolve(occa::memory& o_P, occa::memory& o_RHS, + void PressureIncrementSolve(deviceMemory& o_P, deviceMemory& o_RHS, const dfloat gamma, const dfloat T, const dfloat dt); }; diff --git a/solvers/ins/insMain.cpp b/solvers/ins/insMain.cpp index f49b16251..250c5e115 100644 --- a/solvers/ins/insMain.cpp +++ b/solvers/ins/insMain.cpp @@ -1,7 +1,7 @@ /* The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -28,40 +28,41 @@ SOFTWARE. int main(int argc, char **argv){ // start up MPI - MPI_Init(&argc, &argv); + Comm::Init(argc, argv); - MPI_Comm comm = MPI_COMM_WORLD; + LIBP_ABORT("Usage: ./insMain setupfile", argc!=2); - if(argc!=2) - LIBP_ABORT(string("Usage: ./insMain setupfile")); + { /*Scope so everything is destructed before MPI_Finalize */ + comm_t comm(Comm::World().Dup()); - //create default settings - platformSettings_t platformSettings(comm); - meshSettings_t meshSettings(comm); - insSettings_t insSettings(comm); + //create default settings + platformSettings_t platformSettings(comm); + meshSettings_t meshSettings(comm); + insSettings_t insSettings(comm); - //load settings from file - insSettings.parseFromFile(platformSettings, meshSettings, - argv[1]); + //load settings from file + insSettings.parseFromFile(platformSettings, meshSettings, + argv[1]); - // set up platform - platform_t platform(platformSettings); + // set up platform + platform_t platform(platformSettings); - platformSettings.report(); - meshSettings.report(); - insSettings.report(); + platformSettings.report(); + meshSettings.report(); + insSettings.report(); - // set up mesh - mesh_t& mesh = mesh_t::Setup(platform, meshSettings, comm); + // set up mesh + mesh_t mesh(platform, meshSettings, comm); - // set 
up ins solver - ins_t& ins = ins_t::Setup(platform, mesh, insSettings); + // set up ins solver + ins_t ins(platform, mesh, insSettings); - // run - ins.Run(); + // run + ins.Run(); + } // close down MPI - MPI_Finalize(); + Comm::Finalize(); return LIBP_SUCCESS; } diff --git a/solvers/ins/makefile b/solvers/ins/makefile index e87b7037c..ea73444b0 100644 --- a/solvers/ins/makefile +++ b/solvers/ins/makefile @@ -2,7 +2,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal @@ -77,29 +77,25 @@ include ../../make.top endif endif -#gslib -GS_DIR=${LIBP_TPL_DIR}/gslib - #libraries ELLIPTIC_DIR =${LIBP_DIR}/solvers/elliptic -FPE_LIBP_LIBS=timeStepper linearSolver parAlmond mesh ogs linAlg core +FPE_LIBP_LIBS=timeStepper linearSolver parAlmond mesh parAdogs ogs linAlg core #includes INCLUDES=-I${ELLIPTIC_DIR} \ - ${LIBP_INCLUDES} \ - -I. + ${LIBP_INCLUDES} \ + -I. 
#defines DEFINES =${LIBP_DEFINES} \ -DLIBP_DIR='"${LIBP_DIR}"' #.cpp compilation flags -INS_CXXFLAGS=${LIBP_MPICXXFLAGS} ${DEFINES} ${INCLUDES} +INS_CXXFLAGS=${LIBP_CXXFLAGS} ${DEFINES} ${INCLUDES} #link libraries LIBS=-L${ELLIPTIC_DIR} -lelliptic \ - -L${LIBP_LIBS_DIR} $(addprefix -l,$(FPE_LIBP_LIBS)) \ - -L$(GS_DIR)/lib -lgs \ + -L${LIBP_LIBS_DIR} $(addprefix -l,$(FPE_LIBP_LIBS)) \ ${LIBP_LIBS} #link flags @@ -155,10 +151,10 @@ endif # rule for .cpp files %.o: %.cpp $(DEPS) | libelliptic ifneq (,${verbose}) - $(LIBP_MPICXX) -o $*.o -c $*.cpp $(INS_CXXFLAGS) + $(LIBP_CXX) -o $*.o -c $*.cpp $(INS_CXXFLAGS) else @printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n"; - @$(LIBP_MPICXX) -o $*.o -c $*.cpp $(INS_CXXFLAGS) + @$(LIBP_CXX) -o $*.o -c $*.cpp $(INS_CXXFLAGS) endif #cleanup @@ -170,8 +166,7 @@ clean-libs: clean ${MAKE} -C ${LIBP_LIBS_DIR} clean clean-kernels: clean-libs -# $(shell ${OCCA_DIR}/bin/occa clear all -y) - rm -rf ~/.occa/ + rm -rf ${LIBP_DIR}/.occa/ realclean: clean ${MAKE} -C ${ELLIPTIC_DIR} clean diff --git a/solvers/ins/okl/insAdvectionHex3D.okl b/solvers/ins/okl/insAdvectionHex3D.okl index f9efa2cb2..7d1280891 100644 --- a/solvers/ins/okl/insAdvectionHex3D.okl +++ b/solvers/ins/okl/insAdvectionHex3D.okl @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -62,7 +62,6 @@ SOFTWARE. 
} } - @barrier("local"); #pragma unroll p_Nq for(int k=0;k - @barrier("local"); for(int i=0;i - @barrier("local"); for(int i=0;i - @barrier("local"); for(int i=0;i - @barrier("local"); for(int i=0;i - @barrier("local"); for(int i=0;i - @barrier("local"); for(int i=0;i& o_U, + const dfloat beta, deviceMemory& o_RHS, const dfloat T) { - vTraceHalo->ExchangeStart(o_U, 1, ogs_dfloat); + vTraceHalo.ExchangeStart(o_U, 1); if (cubature) advectionVolumeKernel(mesh.Nelements, @@ -54,7 +54,7 @@ void ins_t::Advection(const dfloat alpha, occa::memory& o_U, o_U, o_RHS); - vTraceHalo->ExchangeFinish(o_U, 1, ogs_dfloat); + vTraceHalo.ExchangeFinish(o_U, 1); if (cubature) advectionSurfaceKernel(mesh.Nelements, diff --git a/solvers/ins/src/insDiffusion.cpp b/solvers/ins/src/insDiffusion.cpp index 49f355cca..c8813d9b9 100644 --- a/solvers/ins/src/insDiffusion.cpp +++ b/solvers/ins/src/insDiffusion.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,8 +27,8 @@ SOFTWARE. 
#include "ins.hpp" // compute RHS = beta*RHS + alpha*L(U) -void ins_t::Diffusion(const dfloat alpha, occa::memory& o_U, - const dfloat beta, occa::memory& o_RHS, +void ins_t::Diffusion(const dfloat alpha, deviceMemory& o_U, + const dfloat beta, deviceMemory& o_RHS, const dfloat T) { //IPDG @@ -39,7 +39,7 @@ void ins_t::Diffusion(const dfloat alpha, occa::memory& o_U, o_GU); // dfloat4 storage -> 4 entries - vTraceHalo->ExchangeStart(o_GU, 4, ogs_dfloat); + vTraceHalo.ExchangeStart(o_GU, 4); if(mesh.NinternalElements) diffusionKernel(mesh.NinternalElements, @@ -62,7 +62,7 @@ void ins_t::Diffusion(const dfloat alpha, occa::memory& o_U, o_GU, o_RHS); - vTraceHalo->ExchangeFinish(o_GU, 4, ogs_dfloat); + vTraceHalo.ExchangeFinish(o_GU, 4); if(mesh.NhaloElements) diffusionKernel(mesh.NhaloElements, @@ -84,4 +84,4 @@ void ins_t::Diffusion(const dfloat alpha, occa::memory& o_U, beta, o_GU, o_RHS); -} \ No newline at end of file +} diff --git a/solvers/ins/src/insDivergence.cpp b/solvers/ins/src/insDivergence.cpp index e5366c9c7..12d838802 100644 --- a/solvers/ins/src/insDivergence.cpp +++ b/solvers/ins/src/insDivergence.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,11 +27,11 @@ SOFTWARE. 
#include "ins.hpp" // compute RHS = beta*RHS + alpha*div U -void ins_t::Divergence(const dfloat alpha, occa::memory& o_U, - const dfloat beta, occa::memory& o_RHS, +void ins_t::Divergence(const dfloat alpha, deviceMemory& o_U, + const dfloat beta, deviceMemory& o_RHS, const dfloat T){ - vTraceHalo->ExchangeStart(o_U, 1, ogs_dfloat); + vTraceHalo.ExchangeStart(o_U, 1); // computes div u^(n+1) volume term divergenceVolumeKernel(mesh.Nelements, @@ -42,7 +42,7 @@ void ins_t::Divergence(const dfloat alpha, occa::memory& o_U, o_U, o_RHS); - vTraceHalo->ExchangeFinish(o_U, 1, ogs_dfloat); + vTraceHalo.ExchangeFinish(o_U, 1); divergenceSurfaceKernel(mesh.Nelements, mesh.o_sgeo, diff --git a/solvers/ins/src/insGradient.cpp b/solvers/ins/src/insGradient.cpp index 88577f68f..36b2019bc 100644 --- a/solvers/ins/src/insGradient.cpp +++ b/solvers/ins/src/insGradient.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,11 +27,11 @@ SOFTWARE. 
#include "ins.hpp" // compute RHS = beta*RHS + alpha*grad P -void ins_t::Gradient(const dfloat alpha, occa::memory& o_P, - const dfloat beta, occa::memory& o_RHS, +void ins_t::Gradient(const dfloat alpha, deviceMemory& o_P, + const dfloat beta, deviceMemory& o_RHS, const dfloat T){ - pTraceHalo->ExchangeStart(o_P, 1, ogs_dfloat); + pTraceHalo.ExchangeStart(o_P, 1); // Compute Volume Contribution gradientVolumeKernel(mesh.Nelements, @@ -42,7 +42,7 @@ void ins_t::Gradient(const dfloat alpha, occa::memory& o_P, o_P, o_RHS); - pTraceHalo->ExchangeFinish(o_P, 1, ogs_dfloat); + pTraceHalo.ExchangeFinish(o_P, 1); // Compute Surface Conribution gradientSurfaceKernel(mesh.Nelements, diff --git a/solvers/ins/src/insPlotFields.cpp b/solvers/ins/src/insPlotFields.cpp index 1b9f375f1..e65e037d4 100644 --- a/solvers/ins/src/insPlotFields.cpp +++ b/solvers/ins/src/insPlotFields.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,11 +27,11 @@ SOFTWARE. 
#include "ins.hpp" // interpolate data to plot nodes and save to file (one per process -void ins_t::PlotFields(dfloat* U, dfloat* P, dfloat *V, char *fileName){ +void ins_t::PlotFields(memory& U, memory& P, memory& V, std::string fileName){ FILE *fp; - fp = fopen(fileName, "w"); + fp = fopen(fileName.c_str(), "w"); fprintf(fp, "\n"); fprintf(fp, " \n"); @@ -44,36 +44,42 @@ void ins_t::PlotFields(dfloat* U, dfloat* P, dfloat *V, char *fileName){ fprintf(fp, " \n"); //scratch space for interpolation - size_t NscratchBytes = mymax(mesh.Np, mesh.plotNp)*sizeof(dfloat); - dfloat* scratch = (dfloat *) malloc(2*NscratchBytes); + size_t Nscratch = std::max(mesh.Np, mesh.plotNp); + memory scratch(2*Nscratch); - dfloat* Ix = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iy = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iz = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); + memory Ix(mesh.plotNp); + memory Iy(mesh.plotNp); + memory Iz(mesh.plotNp); // compute plot node coordinates on the fly for(dlong e=0;e\n"); fprintf(fp, " \n"); - free(Ix); free(Iy); free(Iz); - - dfloat* Ip = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iu = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iv = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iw = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); + memory Ip(mesh.plotNp); + memory Iu(mesh.plotNp); + memory Iv(mesh.plotNp); + memory Iw(mesh.plotNp); fprintf(fp, " \n"); - if (U!=nullptr) { + if (U.length()!=0) { // write out velocity fprintf(fp, " \n", mesh.dim); for(dlong e=0;e\n"); } - if (P!=nullptr) { + if (P.length()!=0) { // write out pressure fprintf(fp, " \n"); for(dlong e=0;e\n"); } - if (V!=nullptr) { + if (V.length()!=0) { // write out vorticity if(mesh.dim==2){ fprintf(fp, " \n"); @@ -138,8 +144,6 @@ void ins_t::PlotFields(dfloat* U, dfloat* P, dfloat *V, char *fileName){ } fprintf(fp, " \n"); - free(Ip); free(Iu); free(Iv); free(Iw); - fprintf(fp, " \n"); fprintf(fp, " \n"); @@ 
-180,6 +184,4 @@ void ins_t::PlotFields(dfloat* U, dfloat* P, dfloat *V, char *fileName){ fprintf(fp, " \n"); fprintf(fp, "\n"); fclose(fp); - - free(scratch); } diff --git a/solvers/ins/src/insPressureIncrementSolve.cpp b/solvers/ins/src/insPressureIncrementSolve.cpp index 7c69ac544..fad714c62 100644 --- a/solvers/ins/src/insPressureIncrementSolve.cpp +++ b/solvers/ins/src/insPressureIncrementSolve.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -28,11 +28,12 @@ SOFTWARE. // Solves -gamma*Laplacian*PI = rhs // P += PI -void ins_t::PressureIncrementSolve(occa::memory& o_P, occa::memory& o_RHS, +void ins_t::PressureIncrementSolve(deviceMemory& o_P, deviceMemory& o_RHS, const dfloat gamma, const dfloat T, const dfloat dt){ // compute RHS = MM*RHS/gamma + BCdata pressureIncrementRhsKernel(mesh.Nelements, + mesh.o_wJ, mesh.o_vgeo, mesh.o_sgeo, mesh.o_ggeo, @@ -43,7 +44,7 @@ void ins_t::PressureIncrementSolve(occa::memory& o_P, occa::memory& o_RHS, mesh.o_sM, mesh.o_vmapM, mesh.o_EToB, - o_mapB, + mesh.o_mapB, pTau, T, dt, @@ -60,18 +61,18 @@ void ins_t::PressureIncrementSolve(occa::memory& o_P, occa::memory& o_RHS, // Solve - Laplacian*PI = RHS if(pDisc_c0) { // gather, solve, scatter - pSolver->ogsMasked->Gather(o_GrhsP, o_RHS, ogs_dfloat, ogs_add, ogs_trans); - NiterP = pSolver->Solve(*pLinearSolver, o_GPI, o_GrhsP, presTOL, maxIter, verbose); - pSolver->ogsMasked->Scatter(o_PI, o_GPI, ogs_dfloat, ogs_add, ogs_notrans); + pSolver.ogsMasked.Gather(o_GrhsP, o_RHS, 1, ogs::Add, ogs::Trans); + NiterP = pSolver.Solve(pLinearSolver, o_GPI, o_GrhsP, presTOL, maxIter, verbose); + pSolver.ogsMasked.Scatter(o_PI, o_GPI, 1, ogs::NoTrans); } else { - NiterP = 
pSolver->Solve(*pLinearSolver, o_PI, o_RHS, presTOL, maxIter, verbose); + NiterP = pSolver.Solve(pLinearSolver, o_PI, o_RHS, presTOL, maxIter, verbose); } // P += PI and enter BCs if C0 pressureIncrementBCKernel(mesh.Nelements, mesh.o_sgeo, mesh.o_vmapM, - o_mapB, + mesh.o_mapB, T, mesh.o_x, mesh.o_y, diff --git a/solvers/ins/src/insPressureSolve.cpp b/solvers/ins/src/insPressureSolve.cpp index 36d34cfe8..894e590c1 100644 --- a/solvers/ins/src/insPressureSolve.cpp +++ b/solvers/ins/src/insPressureSolve.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,11 +27,12 @@ SOFTWARE. #include "ins.hpp" // Solves -gamma*Laplacian*P = rhs -void ins_t::PressureSolve(occa::memory& o_P, occa::memory& o_RHS, +void ins_t::PressureSolve(deviceMemory& o_P, deviceMemory& o_RHS, const dfloat gamma, const dfloat T){ // compute RHS = MM*RHS/gamma + BCdata pressureRhsKernel(mesh.Nelements, + mesh.o_wJ, mesh.o_vgeo, mesh.o_sgeo, mesh.o_ggeo, @@ -42,7 +43,7 @@ void ins_t::PressureSolve(occa::memory& o_P, occa::memory& o_RHS, mesh.o_sM, mesh.o_vmapM, mesh.o_EToB, - o_mapB, + mesh.o_mapB, pTau, T, mesh.o_x, @@ -58,15 +59,15 @@ void ins_t::PressureSolve(occa::memory& o_P, occa::memory& o_RHS, if(pDisc_c0) { // gather, solve, scatter - pSolver->ogsMasked->Gather(o_GrhsP, o_RHS, ogs_dfloat, ogs_add, ogs_trans); - NiterP = pSolver->Solve(*pLinearSolver, o_GP, o_GrhsP, presTOL, maxIter, verbose); - pSolver->ogsMasked->Scatter(o_P, o_GP, ogs_dfloat, ogs_add, ogs_notrans); + pSolver.ogsMasked.Gather(o_GrhsP, o_RHS, 1, ogs::Add, ogs::Trans); + NiterP = pSolver.Solve(pLinearSolver, o_GP, o_GrhsP, presTOL, maxIter, verbose); + pSolver.ogsMasked.Scatter(o_P, o_GP, 1, ogs::NoTrans); // enter BCs if C0 
pressureBCKernel(mesh.Nelements, mesh.o_sgeo, mesh.o_vmapM, - o_mapB, + mesh.o_mapB, T, mesh.o_x, mesh.o_y, @@ -74,6 +75,6 @@ void ins_t::PressureSolve(occa::memory& o_P, occa::memory& o_RHS, nu, o_P); } else { - NiterP = pSolver->Solve(*pLinearSolver, o_P, o_RHS, presTOL, maxIter, verbose); + NiterP = pSolver.Solve(pLinearSolver, o_P, o_RHS, presTOL, maxIter, verbose); } } diff --git a/solvers/ins/src/insReport.cpp b/solvers/ins/src/insReport.cpp index 03e4c77f9..4a9fa1b36 100644 --- a/solvers/ins/src/insReport.cpp +++ b/solvers/ins/src/insReport.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -34,7 +34,7 @@ void ins_t::Report(dfloat time, int tstep){ mesh.MassMatrixApply(o_u, o_MU); dlong Nentries = mesh.Nelements*mesh.Np*NVfields; - dfloat norm2 = sqrt(linAlg.innerProd(Nentries, o_u, o_MU, mesh.comm)); + dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_u, o_MU, mesh.comm)); if(mesh.rank==0) printf("\n%5.2f (%d), %5.2f (time, timestep, norm)\n", time, tstep, norm2); @@ -49,11 +49,11 @@ void ins_t::Report(dfloat time, int tstep){ o_Vort.copyTo(Vort); // output field files - string name; + std::string name; settings.getSetting("OUTPUT FILE NAME", name); char fname[BUFSIZ]; sprintf(fname, "%s_%04d_%04d.vtu", name.c_str(), mesh.rank, frame++); - PlotFields(u, p, Vort, fname); + PlotFields(u, p, Vort, std::string(fname)); } } diff --git a/solvers/ins/src/insRun.cpp b/solvers/ins/src/insRun.cpp index 4b2dca40b..9b328508a 100644 --- a/solvers/ins/src/insRun.cpp +++ b/solvers/ins/src/insRun.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, 
Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -56,14 +56,14 @@ void ins_t::Run(){ dt = dtAdvc; } else if (settings.compareSetting("TIME INTEGRATOR","SSBDF3")) { dt = Nsubcycles*dtAdvc; - subStepper->SetTimeStep(dtAdvc); + subStepper.SetTimeStep(dtAdvc); } else { - dt = mymin(dtAdvc, dtDiff); + dt = std::min(dtAdvc, dtDiff); } - timeStepper->SetTimeStep(dt); + timeStepper.SetTimeStep(dt); - timeStepper->Run(o_u, startTime, finalTime); + timeStepper.Run(*this, o_u, startTime, finalTime); // output norm of final solution { @@ -71,7 +71,7 @@ void ins_t::Run(){ mesh.MassMatrixApply(o_u, o_MU); dlong Nentries = mesh.Nelements*mesh.Np*NVfields; - dfloat norm2 = sqrt(linAlg.innerProd(Nentries, o_u, o_MU, mesh.comm)); + dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_u, o_MU, mesh.comm)); if(mesh.rank==0) printf("Solution norm = %17.15lg\n", norm2); diff --git a/solvers/ins/src/insSettings.cpp b/solvers/ins/src/insSettings.cpp index 9010f13a6..ba63cdd2b 100644 --- a/solvers/ins/src/insSettings.cpp +++ b/solvers/ins/src/insSettings.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,7 +27,7 @@ SOFTWARE. 
#include "ins.hpp" //settings for ins solver -insSettings_t::insSettings_t(MPI_Comm& _comm): +insSettings_t::insSettings_t(comm_t& _comm): settings_t(_comm) { newSetting("DATA FILE", @@ -88,19 +88,16 @@ insSettings_t::insSettings_t(MPI_Comm& _comm): ellipticAddSettings(*this, "VELOCITY "); parAlmond::AddSettings(*this, "VELOCITY "); - initialGuessAddSettings(*this, "VELOCITY "); + InitialGuess::AddSettings(*this, "VELOCITY "); ellipticAddSettings(*this, "PRESSURE "); parAlmond::AddSettings(*this, "PRESSURE "); - initialGuessAddSettings(*this, "PRESSURE "); + InitialGuess::AddSettings(*this, "PRESSURE "); } void insSettings_t::report() { - int rank; - MPI_Comm_rank(comm, &rank); - - if (rank==0) { + if (comm.rank()==0) { std::cout << "INS Settings:\n\n"; reportSetting("DATA FILE"); reportSetting("VISCOSITY"); @@ -167,15 +164,15 @@ void insSettings_t::report() { void insSettings_t::parseFromFile(platformSettings_t& platformSettings, meshSettings_t& meshSettings, - const string filename) { + const std::string filename) { //read all settings from file settings_t s(comm); s.readSettingsFromFile(filename); for(auto it = s.settings.begin(); it != s.settings.end(); ++it) { - setting_t* set = it->second; - const string name = set->getName(); - const string val = set->getVal(); + setting_t& set = it->second; + const std::string name = set.getName(); + const std::string val = set.getVal(); if (platformSettings.hasSetting(name)) platformSettings.changeSetting(name, val); else if (meshSettings.hasSetting(name)) @@ -183,46 +180,44 @@ void insSettings_t::parseFromFile(platformSettings_t& platformSettings, else if (hasSetting(name)) //self changeSetting(name, val); else { - stringstream ss; - ss << "Unknown setting: [" << name << "] requested"; - LIBP_ABORT(ss.str()); + LIBP_FORCE_ABORT("Unknown setting: [" << name << "] requested"); } } } -ellipticSettings_t* insSettings_t::extractVelocitySettings() { +ellipticSettings_t insSettings_t::extractVelocitySettings() { - 
ellipticSettings_t* velocitySettings = new ellipticSettings_t(comm); + ellipticSettings_t velocitySettings(comm); - initialGuessAddSettings(*velocitySettings); + InitialGuess::AddSettings(velocitySettings); - for(auto it = velocitySettings->settings.begin(); it != velocitySettings->settings.end(); ++it) { - setting_t* set = it->second; - const string name = set->getName(); + for(auto it = velocitySettings.settings.begin(); it != velocitySettings.settings.end(); ++it) { + setting_t& set = it->second; + const std::string name = set.getName(); - string val; + std::string val; getSetting("VELOCITY "+name, val); - set->updateVal(val); + set.updateVal(val); } return velocitySettings; } -ellipticSettings_t* insSettings_t::extractPressureSettings() { +ellipticSettings_t insSettings_t::extractPressureSettings() { - ellipticSettings_t* pressureSettings = new ellipticSettings_t(comm); + ellipticSettings_t pressureSettings(comm); - initialGuessAddSettings(*pressureSettings); + InitialGuess::AddSettings(pressureSettings); - for(auto it = pressureSettings->settings.begin(); it != pressureSettings->settings.end(); ++it) { - setting_t* set = it->second; - const string name = set->getName(); + for(auto it = pressureSettings.settings.begin(); it != pressureSettings.settings.end(); ++it) { + setting_t& set = it->second; + const std::string name = set.getName(); - string val; + std::string val; getSetting("PRESSURE "+name, val); - set->updateVal(val); + set.updateVal(val); } return pressureSettings; diff --git a/solvers/ins/src/insSetup.cpp b/solvers/ins/src/insSetup.cpp index 5190e8ac1..c453105d3 100644 --- a/solvers/ins/src/insSetup.cpp +++ b/solvers/ins/src/insSetup.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation 
files (the "Software"), to deal @@ -26,23 +26,29 @@ SOFTWARE. #include "ins.hpp" -ins_t& ins_t::Setup(platform_t& platform, mesh_t& mesh, - insSettings_t& settings){ +void ins_t::Setup(platform_t& _platform, mesh_t& _mesh, + insSettings_t& _settings){ - ins_t* ins = new ins_t(platform, mesh, settings); + platform = _platform; + mesh = _mesh; + comm = _mesh.comm; + settings = _settings; - ins->NVfields = (mesh.dim==3) ? 3:2; // Total Number of Velocity Fields - ins->NTfields = (mesh.dim==3) ? 4:3; // Total Velocity + Pressure + //Trigger JIT kernel builds + ogs::InitializeKernels(platform, ogs::Dfloat, ogs::Add); - settings.getSetting("VISCOSITY", ins->nu); + NVfields = (mesh.dim==3) ? 3:2; // Total Number of Velocity Fields + NTfields = (mesh.dim==3) ? 4:3; // Total Velocity + Pressure - ins->cubature = (settings.compareSetting("ADVECTION TYPE", "CUBATURE")) ? 1:0; - ins->pressureIncrement = (settings.compareSetting("PRESSURE INCREMENT", "TRUE")) ? 1:0; + settings.getSetting("VISCOSITY", nu); + + cubature = (settings.compareSetting("ADVECTION TYPE", "CUBATURE")) ? 1:0; + pressureIncrement = (settings.compareSetting("PRESSURE INCREMENT", "TRUE")) ? 
1:0; //setup cubature - if (ins->cubature) { + if (cubature) { mesh.CubatureSetup(); - mesh.CubatureNodes(); + mesh.CubaturePhysicalNodes(); } dlong Nlocal = mesh.Nelements*mesh.Np; @@ -51,27 +57,22 @@ ins_t& ins_t::Setup(platform_t& platform, mesh_t& mesh, //setup timeStepper dfloat gamma = 0.0; if (settings.compareSetting("TIME INTEGRATOR","EXTBDF3")){ - ins->timeStepper = new TimeStepper::extbdf3(mesh.Nelements, mesh.totalHaloPairs, - mesh.Np, ins->NVfields, *ins); - gamma = ((TimeStepper::extbdf3*) ins->timeStepper)->getGamma(); + timeStepper.Setup(mesh.Nelements, + mesh.totalHaloPairs, + mesh.Np, NVfields, platform, comm); + gamma = timeStepper.GetGamma(); } else if (settings.compareSetting("TIME INTEGRATOR","SSBDF3")){ - ins->timeStepper = new TimeStepper::ssbdf3(mesh.Nelements, mesh.totalHaloPairs, - mesh.Np, ins->NVfields, *ins); - gamma = ((TimeStepper::ssbdf3*) ins->timeStepper)->getGamma(); + timeStepper.Setup(mesh.Nelements, + mesh.totalHaloPairs, + mesh.Np, NVfields, platform, comm); + gamma = timeStepper.GetGamma(); } - ins->Nsubcycles=1; + Nsubcycles=1; if (settings.compareSetting("TIME INTEGRATOR","SSBDF3")) - settings.getSetting("NUMBER OF SUBCYCLES", ins->Nsubcycles); + settings.getSetting("NUMBER OF SUBCYCLES", Nsubcycles); //Setup velocity Elliptic solvers - ins->uSolver=NULL; - ins->vSolver=NULL; - ins->wSolver=NULL; - ins->uLinearSolver=NULL; - ins->vLinearSolver=NULL; - ins->wLinearSolver=NULL; - dlong uNlocal=0, vNlocal=0, wNlocal=0; dlong uNhalo=0, vNhalo=0, wNhalo=0; if (settings.compareSetting("TIME INTEGRATOR","EXTBDF3") @@ -85,273 +86,404 @@ ins_t& ins_t::Setup(platform_t& platform, mesh_t& mesh, // bc = 5 -> y-aligned slip // bc = 6 -> z-aligned slip int NBCTypes = 7; - int uBCType[NBCTypes] = {0,1,1,2,1,2,2}; // bc=3 => outflow => Neumann => vBCType[3] = 2, etc. - int vBCType[NBCTypes] = {0,1,1,2,2,1,2}; // bc=3 => outflow => Neumann => vBCType[3] = 2, etc. 
- int wBCType[NBCTypes] = {0,1,1,2,2,2,1}; // bc=3 => outflow => Neumann => vBCType[3] = 2, etc. - - ins->vSettings = settings.extractVelocitySettings(); + memory uBCType(NBCTypes); + // bc=3 => outflow => Neumann => vBCType[3] = 2, etc. + uBCType[0] = 0; + uBCType[1] = 1; + uBCType[2] = 1; + uBCType[3] = 2; + uBCType[4] = 1; + uBCType[5] = 2; + uBCType[6] = 2; + + memory vBCType(NBCTypes); + // bc=3 => outflow => Neumann => vBCType[3] = 2, etc. + vBCType[0] = 0; + vBCType[1] = 1; + vBCType[2] = 1; + vBCType[3] = 2; + vBCType[4] = 2; + vBCType[5] = 1; + vBCType[6] = 2; + + memory wBCType(NBCTypes); + // bc=3 => outflow => Neumann => vBCType[3] = 2, etc. + wBCType[0] = 0; + wBCType[1] = 1; + wBCType[2] = 1; + wBCType[3] = 2; + wBCType[4] = 2; + wBCType[5] = 2; + wBCType[6] = 1; + + vSettings = _settings.extractVelocitySettings(); //make a guess at dt for the lambda value //TODO: we should allow preconditioners to be re-setup if lambda is updated dfloat hmin = mesh.MinCharacteristicLength(); - dfloat dtAdvc = ins->Nsubcycles*hmin/((mesh.N+1.)*(mesh.N+1.)); - dfloat lambda = gamma/(dtAdvc*ins->nu); - ins->uSolver = &(elliptic_t::Setup(platform, mesh, *(ins->vSettings), - lambda, NBCTypes, uBCType)); - ins->vSolver = &(elliptic_t::Setup(platform, mesh, *(ins->vSettings), - lambda, NBCTypes, vBCType)); - ins->wSolver = &(elliptic_t::Setup(platform, mesh, *(ins->vSettings), - lambda, NBCTypes, wBCType)); - ins->vTau = ins->uSolver->tau; - - ins->vDisc_c0 = settings.compareSetting("VELOCITY DISCRETIZATION", "CONTINUOUS") ? 
1 : 0; - - uNlocal = ins->uSolver->Ndofs; - vNlocal = ins->vSolver->Ndofs; - if (mesh.dim == 3) wNlocal = ins->wSolver->Ndofs; - - uNhalo = ins->uSolver->Nhalo; - vNhalo = ins->vSolver->Nhalo; - if (mesh.dim == 3) wNhalo = ins->wSolver->Nhalo; - - ins->uLinearSolver = initialGuessSolver_t::Setup(uNlocal, uNhalo, - platform, *(ins->vSettings), mesh.comm); - - ins->vLinearSolver = initialGuessSolver_t::Setup(vNlocal, vNhalo, - platform, *(ins->vSettings), mesh.comm); - if (mesh.dim == 3) { - ins->wLinearSolver = initialGuessSolver_t::Setup(wNlocal, wNhalo, - platform, *(ins->vSettings), mesh.comm); + dfloat dtAdvc = Nsubcycles*hmin/((mesh.N+1.)*(mesh.N+1.)); + dfloat lambda = gamma/(dtAdvc*nu); + uSolver.Setup(platform, mesh, vSettings, + lambda, NBCTypes, uBCType); + vSolver.Setup(platform, mesh, vSettings, + lambda, NBCTypes, vBCType); + if (mesh.dim == 3) + wSolver.Setup(platform, mesh, vSettings, + lambda, NBCTypes, wBCType); + + vTau = uSolver.tau; + + vDisc_c0 = settings.compareSetting("VELOCITY DISCRETIZATION", "CONTINUOUS") ? 
1 : 0; + + uNlocal = uSolver.Ndofs; + vNlocal = vSolver.Ndofs; + if (mesh.dim == 3) wNlocal = wSolver.Ndofs; + + uNhalo = uSolver.Nhalo; + vNhalo = vSolver.Nhalo; + if (mesh.dim == 3) wNhalo = wSolver.Nhalo; + + if (vSettings.compareSetting("LINEAR SOLVER","NBPCG")){ + + uLinearSolver.Setup(uNlocal, uNhalo, platform, vSettings, comm); + vLinearSolver.Setup(vNlocal, vNhalo, platform, vSettings, comm); + if (mesh.dim==3) + wLinearSolver.Setup(wNlocal, wNhalo, platform, vSettings, comm); + + } else if (vSettings.compareSetting("LINEAR SOLVER","NBFPCG")){ + + uLinearSolver.Setup(uNlocal, uNhalo, platform, vSettings, comm); + vLinearSolver.Setup(vNlocal, vNhalo, platform, vSettings, comm); + if (mesh.dim==3) + wLinearSolver.Setup(wNlocal, wNhalo, platform, vSettings, comm); + + } else if (vSettings.compareSetting("LINEAR SOLVER","PCG")){ + + uLinearSolver.Setup(uNlocal, uNhalo, platform, vSettings, comm); + vLinearSolver.Setup(vNlocal, vNhalo, platform, vSettings, comm); + if (mesh.dim==3) + wLinearSolver.Setup(wNlocal, wNhalo, platform, vSettings, comm); + + } else if (vSettings.compareSetting("LINEAR SOLVER","PGMRES")){ + + uLinearSolver.Setup(uNlocal, uNhalo, platform, vSettings, comm); + vLinearSolver.Setup(vNlocal, vNhalo, platform, vSettings, comm); + if (mesh.dim==3) + wLinearSolver.Setup(wNlocal, wNhalo, platform, vSettings, comm); + + } else if (vSettings.compareSetting("LINEAR SOLVER","PMINRES")){ + + uLinearSolver.Setup(uNlocal, uNhalo, platform, vSettings, comm); + vLinearSolver.Setup(vNlocal, vNhalo, platform, vSettings, comm); + if (mesh.dim==3) + wLinearSolver.Setup(wNlocal, wNhalo, platform, vSettings, comm); + } + + if (vSettings.compareSetting("INITIAL GUESS STRATEGY", "NONE")) { + + uLinearSolver.SetupInitialGuess(uNlocal, platform, vSettings, comm); + vLinearSolver.SetupInitialGuess(vNlocal, platform, vSettings, comm); + if (mesh.dim==3) + wLinearSolver.SetupInitialGuess(wNlocal, platform, vSettings, comm); + + } else if 
(vSettings.compareSetting("INITIAL GUESS STRATEGY", "ZERO")) { + + uLinearSolver.SetupInitialGuess(uNlocal, platform, vSettings, comm); + vLinearSolver.SetupInitialGuess(vNlocal, platform, vSettings, comm); + if (mesh.dim==3) + wLinearSolver.SetupInitialGuess(wNlocal, platform, vSettings, comm); + + } else if (vSettings.compareSetting("INITIAL GUESS STRATEGY", "CLASSIC")) { + + uLinearSolver.SetupInitialGuess(uNlocal, platform, vSettings, comm); + vLinearSolver.SetupInitialGuess(vNlocal, platform, vSettings, comm); + if (mesh.dim==3) + wLinearSolver.SetupInitialGuess(wNlocal, platform, vSettings, comm); + + } else if (vSettings.compareSetting("INITIAL GUESS STRATEGY", "QR")) { + + uLinearSolver.SetupInitialGuess(uNlocal, platform, vSettings, comm); + vLinearSolver.SetupInitialGuess(vNlocal, platform, vSettings, comm); + if (mesh.dim==3) + wLinearSolver.SetupInitialGuess(wNlocal, platform, vSettings, comm); + + } else if (vSettings.compareSetting("INITIAL GUESS STRATEGY", "EXTRAP")) { + + uLinearSolver.SetupInitialGuess(uNlocal, platform, vSettings, comm); + vLinearSolver.SetupInitialGuess(vNlocal, platform, vSettings, comm); + if (mesh.dim==3) + wLinearSolver.SetupInitialGuess(wNlocal, platform, vSettings, comm); + } } else { - ins->vDisc_c0 = 0; + vDisc_c0 = 0; //set penalty - if (mesh.elementType==TRIANGLES || - mesh.elementType==QUADRILATERALS){ - ins->vTau = 2.0*(mesh.N+1)*(mesh.N+2)/2.0; + if (mesh.elementType==Mesh::TRIANGLES || + mesh.elementType==Mesh::QUADRILATERALS){ + vTau = 2.0*(mesh.N+1)*(mesh.N+2)/2.0; if(mesh.dim==3) - ins->vTau *= 1.5; + vTau *= 1.5; } else - ins->vTau = 2.0*(mesh.N+1)*(mesh.N+3); + vTau = 2.0*(mesh.N+1)*(mesh.N+3); } //Setup pressure Elliptic solver dlong pNlocal=0, pNhalo=0; { int NBCTypes = 7; - int pBCType[NBCTypes] = {0,2,2,1,2,2,2}; // bc=3 => outflow => Dirichlet => pBCType[3] = 1, etc. 
- - ins->pSettings = settings.extractPressureSettings(); - ins->pSolver = &(elliptic_t::Setup(platform, mesh, *(ins->pSettings), - 0.0, NBCTypes, pBCType)); - ins->pTau = ins->pSolver->tau; - - ins->pDisc_c0 = settings.compareSetting("PRESSURE DISCRETIZATION", "CONTINUOUS") ? 1 : 0; - - if (ins->pDisc_c0) { - pNlocal = ins->pSolver->ogsMasked->Ngather; - pNhalo = ins->pSolver->ogsMasked->NgatherHalo; + memory pBCType(NBCTypes); + // bc=3 => outflow => Dirichlet => pBCType[3] = 1, etc. + pBCType[0] = 0; + pBCType[1] = 2; + pBCType[2] = 2; + pBCType[3] = 1; + pBCType[4] = 2; + pBCType[5] = 2; + pBCType[6] = 2; + + pSettings = _settings.extractPressureSettings(); + pSolver.Setup(platform, mesh, pSettings, + 0.0, NBCTypes, pBCType); + pTau = pSolver.tau; + + pDisc_c0 = settings.compareSetting("PRESSURE DISCRETIZATION", "CONTINUOUS") ? 1 : 0; + + if (pDisc_c0) { + pNlocal = pSolver.ogsMasked.Ngather; + pNhalo = pSolver.gHalo.Nhalo; } else { pNlocal = mesh.Nelements*mesh.Np; pNhalo = mesh.totalHaloPairs*mesh.Np; } - ins->pLinearSolver = initialGuessSolver_t::Setup(pNlocal, pNhalo, - platform, *(ins->pSettings), mesh.comm); + if (vSettings.compareSetting("LINEAR SOLVER","NBPCG")){ + pLinearSolver.Setup(pNlocal, pNhalo, platform, pSettings, comm); + } else if (pSettings.compareSetting("LINEAR SOLVER","NBFPCG")){ + pLinearSolver.Setup(pNlocal, pNhalo, platform, pSettings, comm); + } else if (pSettings.compareSetting("LINEAR SOLVER","PCG")){ + pLinearSolver.Setup(pNlocal, pNhalo, platform, pSettings, comm); + } else if (pSettings.compareSetting("LINEAR SOLVER","PGMRES")){ + pLinearSolver.Setup(pNlocal, pNhalo, platform, pSettings, comm); + } else if (pSettings.compareSetting("LINEAR SOLVER","PMINRES")){ + pLinearSolver.Setup(pNlocal, pNhalo, platform, pSettings, comm); + } + + if (pSettings.compareSetting("INITIAL GUESS STRATEGY", "NONE")) { + pLinearSolver.SetupInitialGuess(pNlocal, platform, pSettings, comm); + } else if (pSettings.compareSetting("INITIAL GUESS STRATEGY", 
"ZERO")) { + pLinearSolver.SetupInitialGuess(pNlocal, platform, pSettings, comm); + } else if (pSettings.compareSetting("INITIAL GUESS STRATEGY", "CLASSIC")) { + pLinearSolver.SetupInitialGuess(pNlocal, platform, pSettings, comm); + } else if (pSettings.compareSetting("INITIAL GUESS STRATEGY", "QR")) { + pLinearSolver.SetupInitialGuess(pNlocal, platform, pSettings, comm); + } else if (pSettings.compareSetting("INITIAL GUESS STRATEGY", "EXTRAP")) { + pLinearSolver.SetupInitialGuess(pNlocal, platform, pSettings, comm); + } } //Solver tolerances - ins->presTOL = 1E-8; - ins->velTOL = 1E-8; - - //build node-wise boundary flag - ins->BoundarySetup(); + if (sizeof(dfloat)==sizeof(double)) { + presTOL = 1.0E-8; + velTOL = 1.0E-8; + } else { + presTOL = 1.0E-5; + velTOL = 1.0E-5; + } //setup linear algebra module - platform.linAlg.InitKernels({"innerProd", "axpy", "max"}); + platform.linAlg().InitKernels({"innerProd", "axpy", "max"}); /*setup trace halo exchange */ - ins->pTraceHalo = mesh.HaloTraceSetup(1); //one field - ins->vTraceHalo = mesh.HaloTraceSetup(ins->NVfields); //one field + pTraceHalo = mesh.HaloTraceSetup(1); //one field + vTraceHalo = mesh.HaloTraceSetup(NVfields); //one field // u and p at interpolation nodes - ins->u = (dfloat*) calloc((Nlocal+Nhalo)*ins->NVfields, sizeof(dfloat)); - ins->o_u = platform.malloc((Nlocal+Nhalo)*ins->NVfields*sizeof(dfloat), ins->u); + u.malloc((Nlocal+Nhalo)*NVfields, 0.0); + o_u = platform.malloc(u); - ins->p = (dfloat*) calloc(Nlocal+Nhalo, sizeof(dfloat)); - ins->o_p = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), ins->p); + p.malloc(Nlocal+Nhalo, 0.0); + o_p = platform.malloc(p); //storage for velocity gradient if ( !settings.compareSetting("TIME INTEGRATOR","EXTBDF3") && !settings.compareSetting("TIME INTEGRATOR","SSBDF3")) - ins->o_GU = platform.malloc((Nlocal+Nhalo)*4*sizeof(dfloat)); + o_GU = platform.malloc((Nlocal+Nhalo)*4); //extra buffers for solvers if (settings.compareSetting("TIME INTEGRATOR","EXTBDF3") 
||settings.compareSetting("TIME INTEGRATOR","SSBDF3")) { - ins->o_UH = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat)); - ins->o_VH = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat)); + o_UH = platform.malloc(Nlocal+Nhalo, u); + o_VH = platform.malloc(Nlocal+Nhalo, u); if (mesh.dim==3) - ins->o_WH = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat)); - else - ins->o_WH = platform.malloc((1)*sizeof(dfloat)); + o_WH = platform.malloc(Nlocal+Nhalo, u); - ins->o_rhsU = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat)); - ins->o_rhsV = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat)); + o_rhsU = platform.malloc(Nlocal+Nhalo, u); + o_rhsV = platform.malloc(Nlocal+Nhalo, u); if (mesh.dim==3) - ins->o_rhsW = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat)); - else - ins->o_rhsW = platform.malloc((1)*sizeof(dfloat)); + o_rhsW = platform.malloc(Nlocal+Nhalo, u); - if (ins->vDisc_c0) { - ins->o_GUH = platform.malloc((uNlocal+uNhalo)*sizeof(dfloat), ins->u); - ins->o_GVH = platform.malloc((vNlocal+vNhalo)*sizeof(dfloat), ins->u); + if (vDisc_c0) { + o_GUH = platform.malloc(uNlocal+uNhalo, u); + o_GVH = platform.malloc(vNlocal+vNhalo, u); if (mesh.dim==3) - ins->o_GWH = platform.malloc((wNlocal+wNhalo)*sizeof(dfloat), ins->u); + o_GWH = platform.malloc(wNlocal+wNhalo, u); - ins->o_GrhsU = platform.malloc((uNlocal+uNhalo)*sizeof(dfloat)); - ins->o_GrhsV = platform.malloc((vNlocal+vNhalo)*sizeof(dfloat)); + o_GrhsU = platform.malloc(uNlocal+uNhalo, u); + o_GrhsV = platform.malloc(vNlocal+vNhalo, u); if (mesh.dim==3) - ins->o_GrhsW = platform.malloc((wNlocal+wNhalo)*sizeof(dfloat)); + o_GrhsW = platform.malloc(wNlocal+wNhalo, u); } } - if (ins->pressureIncrement) { - ins->o_PI = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), ins->p); - ins->o_GPI = platform.malloc((pNlocal+pNhalo)*sizeof(dfloat), ins->p); + if (pressureIncrement) { + o_PI = platform.malloc(p); + o_GPI = platform.malloc(p); } - ins->o_rhsP = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat)); - if (ins->pDisc_c0) { - ins->o_GP = 
platform.malloc((pNlocal+pNhalo)*sizeof(dfloat), ins->p); - ins->o_GrhsP = platform.malloc((pNlocal+pNhalo)*sizeof(dfloat)); + o_rhsP = platform.malloc(p); + if (pDisc_c0) { + o_GP = platform.malloc(p); + o_GrhsP = platform.malloc(p); } //storage for M*u during reporting - ins->o_MU = platform.malloc((Nlocal+Nhalo)*ins->NVfields*sizeof(dfloat), ins->u); - mesh.MassMatrixKernelSetup(ins->NVfields); // mass matrix operator + o_MU = platform.malloc(u); + mesh.MassMatrixKernelSetup(NVfields); // mass matrix operator if (mesh.dim==2) { - ins->Vort = (dfloat*) calloc((Nlocal+Nhalo), sizeof(dfloat)); - ins->o_Vort = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), ins->Vort); + Vort.malloc(Nlocal+Nhalo, 0.0); + o_Vort = platform.malloc(Vort); } else { - ins->Vort = (dfloat*) calloc((Nlocal+Nhalo)*ins->NVfields, sizeof(dfloat)); - ins->o_Vort = platform.malloc((Nlocal+Nhalo)*ins->NVfields*sizeof(dfloat), ins->Vort); + Vort.malloc((Nlocal+Nhalo)*NVfields, 0.0); + o_Vort = platform.malloc(Vort); } // OCCA build stuff - occa::properties kernelInfo = mesh.props; //copy base occa properties + properties_t kernelInfo = mesh.props; //copy base occa properties //add boundary data to kernel info - string dataFileName; + std::string dataFileName; settings.getSetting("DATA FILE", dataFileName); kernelInfo["includes"] += dataFileName; - kernelInfo["defines/" "p_Nfields"] = ins->NVfields; - kernelInfo["defines/" "p_NVfields"]= ins->NVfields; - kernelInfo["defines/" "p_NTfields"]= ins->NTfields; + kernelInfo["defines/" "p_Nfields"] = NVfields; + kernelInfo["defines/" "p_NVfields"]= NVfields; + kernelInfo["defines/" "p_NTfields"]= NTfields; - int maxNodes = mymax(mesh.Np, (mesh.Nfp*mesh.Nfaces)); + int maxNodes = std::max(mesh.Np, (mesh.Nfp*mesh.Nfaces)); kernelInfo["defines/" "p_maxNodes"]= maxNodes; int blockMax = 256; - int NblockV = mymax(1,blockMax/mesh.Np); + int NblockV = std::max(1,blockMax/mesh.Np); kernelInfo["defines/" "p_NblockV"]= NblockV; - int NblockS = 
mymax(1,blockMax/maxNodes); + int NblockS = std::max(1,blockMax/maxNodes); kernelInfo["defines/" "p_NblockS"]= NblockS; - if (ins->cubature) { - int cubMaxNodes = mymax(mesh.Np, (mesh.intNfp*mesh.Nfaces)); + if (cubature) { + int cubMaxNodes = std::max(mesh.Np, (mesh.intNfp*mesh.Nfaces)); kernelInfo["defines/" "p_cubMaxNodes"]= cubMaxNodes; - int cubMaxNodes1 = mymax(mesh.Np, (mesh.intNfp)); + int cubMaxNodes1 = std::max(mesh.Np, (mesh.intNfp)); kernelInfo["defines/" "p_cubMaxNodes1"]= cubMaxNodes1; - int cubNblockV = mymax(1,blockMax/mesh.cubNp); + int cubNblockV = std::max(1,blockMax/mesh.cubNp); kernelInfo["defines/" "p_cubNblockV"]= cubNblockV; - int cubNblockS = mymax(1,blockMax/cubMaxNodes); + int cubNblockS = std::max(1,blockMax/cubMaxNodes); kernelInfo["defines/" "p_cubNblockS"]= cubNblockS; } - kernelInfo["parser/" "automate-add-barriers"] = "disabled"; - // set kernel name suffix - char *suffix; - if(mesh.elementType==TRIANGLES) - suffix = strdup("Tri2D"); - if(mesh.elementType==QUADRILATERALS) - suffix = strdup("Quad2D"); - if(mesh.elementType==TETRAHEDRA) - suffix = strdup("Tet3D"); - if(mesh.elementType==HEXAHEDRA) - suffix = strdup("Hex3D"); - - char fileName[BUFSIZ], kernelName[BUFSIZ]; + std::string suffix; + if(mesh.elementType==Mesh::TRIANGLES) + suffix = "Tri2D"; + if(mesh.elementType==Mesh::QUADRILATERALS) + suffix = "Quad2D"; + if(mesh.elementType==Mesh::TETRAHEDRA) + suffix = "Tet3D"; + if(mesh.elementType==Mesh::HEXAHEDRA) + suffix = "Hex3D"; + + std::string oklFilePrefix = DINS "/okl/"; + std::string oklFileSuffix = ".okl"; + + std::string fileName, kernelName; // advection kernels - ins->subcycler=NULL; - ins->subStepper=NULL; if (settings.compareSetting("TIME INTEGRATOR","SSBDF3")) { //subcycle kernels - if (ins->cubature) { - sprintf(fileName, DINS "/okl/insSubcycleCubatureAdvection%s.okl", suffix); - sprintf(kernelName, "insSubcycleAdvectionCubatureVolume%s", suffix); - ins->advectionVolumeKernel = platform.buildKernel(fileName, 
kernelName, + if (cubature) { + fileName = oklFilePrefix + "insSubcycleCubatureAdvection" + suffix + oklFileSuffix; + kernelName = "insSubcycleAdvectionCubatureVolume" + suffix; + advectionVolumeKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(kernelName, "insSubcycleAdvectionCubatureSurface%s", suffix); - ins->advectionSurfaceKernel = platform.buildKernel(fileName, kernelName, + kernelName = "insSubcycleAdvectionCubatureSurface" + suffix; + advectionSurfaceKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } else { - sprintf(fileName, DINS "/okl/insSubcycleAdvection%s.okl", suffix); - sprintf(kernelName, "insSubcycleAdvectionVolume%s", suffix); - ins->advectionVolumeKernel = platform.buildKernel(fileName, kernelName, + fileName = oklFilePrefix + "insSubcycleAdvection" + suffix + oklFileSuffix; + kernelName = "insSubcycleAdvectionVolume" + suffix; + advectionVolumeKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(kernelName, "insSubcycleAdvectionSurface%s", suffix); - ins->advectionSurfaceKernel = platform.buildKernel(fileName, kernelName, + kernelName = "insSubcycleAdvectionSurface" + suffix; + advectionSurfaceKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } //build subcycler - ins->subcycler = new subcycler_t(*ins); + subcycler.platform = platform; + subcycler.mesh = mesh; + subcycler.comm = comm; + subcycler.settings = settings; + + subcycler.NVfields = NVfields; + subcycler.nu = nu; + subcycler.cubature = cubature; + subcycler.vTraceHalo = vTraceHalo; + subcycler.advectionVolumeKernel = advectionVolumeKernel; + subcycler.advectionSurfaceKernel = advectionSurfaceKernel; + if (settings.compareSetting("SUBCYCLING TIME INTEGRATOR","AB3")){ - ins->subStepper = new TimeStepper::ab3(mesh.Nelements, mesh.totalHaloPairs, - mesh.Np, ins->NVfields, *(ins->subcycler)); + subStepper.Setup(mesh.Nelements, + mesh.totalHaloPairs, + mesh.Np, NVfields, platform, comm); } else if 
(settings.compareSetting("SUBCYCLING TIME INTEGRATOR","LSERK4")){ - ins->subStepper = new TimeStepper::lserk4(mesh.Nelements, mesh.totalHaloPairs, - mesh.Np, ins->NVfields, *(ins->subcycler)); + subStepper.Setup(mesh.Nelements, + mesh.totalHaloPairs, + mesh.Np, NVfields, platform, comm); } else if (settings.compareSetting("SUBCYCLING TIME INTEGRATOR","DOPRI5")){ - ins->subStepper = new TimeStepper::dopri5(mesh.Nelements, mesh.totalHaloPairs, - mesh.Np, ins->NVfields, *(ins->subcycler), mesh.comm); + subStepper.Setup(mesh.Nelements, + mesh.totalHaloPairs, + mesh.Np, NVfields, platform, comm); } - sprintf(fileName, DINS "/okl/insSubcycleAdvection.okl"); - sprintf(kernelName, "insSubcycleAdvectionKernel"); - ins->subcycler->subCycleAdvectionKernel = platform.buildKernel(fileName, kernelName, + fileName = oklFilePrefix + "insSubcycleAdvection" + oklFileSuffix; + kernelName = "insSubcycleAdvectionKernel"; + subcycler.subCycleAdvectionKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - ins->subcycler->o_Ue = platform.malloc((Nlocal+Nhalo)*ins->NVfields*sizeof(dfloat), ins->u); + subcycler.o_Ue = platform.malloc(u); } else { //regular advection kernels - if (ins->cubature) { - sprintf(fileName, DINS "/okl/insCubatureAdvection%s.okl", suffix); - sprintf(kernelName, "insAdvectionCubatureVolume%s", suffix); - ins->advectionVolumeKernel = platform.buildKernel(fileName, kernelName, + if (cubature) { + fileName = oklFilePrefix + "insCubatureAdvection" + suffix + oklFileSuffix; + kernelName = "insAdvectionCubatureVolume" + suffix; + advectionVolumeKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(kernelName, "insAdvectionCubatureSurface%s", suffix); - ins->advectionSurfaceKernel = platform.buildKernel(fileName, kernelName, + kernelName = "insAdvectionCubatureSurface" + suffix; + advectionSurfaceKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } else { - sprintf(fileName, DINS "/okl/insAdvection%s.okl", suffix); - 
sprintf(kernelName, "insAdvectionVolume%s", suffix); - ins->advectionVolumeKernel = platform.buildKernel(fileName, kernelName, + fileName = oklFilePrefix + "insAdvection" + suffix + oklFileSuffix; + kernelName = "insAdvectionVolume" + suffix; + advectionVolumeKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(kernelName, "insAdvectionSurface%s", suffix); - ins->advectionSurfaceKernel = platform.buildKernel(fileName, kernelName, + kernelName = "insAdvectionSurface" + suffix; + advectionSurfaceKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } } @@ -359,133 +491,95 @@ ins_t& ins_t::Setup(platform_t& platform, mesh_t& mesh, // diffusion kernels if (settings.compareSetting("TIME INTEGRATOR","EXTBDF3") ||settings.compareSetting("TIME INTEGRATOR","SSBDF3")) { - sprintf(fileName, DINS "/okl/insVelocityRhs%s.okl", suffix); + fileName = oklFilePrefix + "insVelocityRhs" + suffix + oklFileSuffix; - if (ins->vDisc_c0) - sprintf(kernelName, "insVelocityRhs%s", suffix); + if (vDisc_c0) + kernelName = "insVelocityRhs" + suffix; else - sprintf(kernelName, "insVelocityIpdgRhs%s", suffix); - ins->velocityRhsKernel = platform.buildKernel(fileName, kernelName, + kernelName = "insVelocityIpdgRhs" + suffix; + velocityRhsKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(kernelName, "insVelocityBC%s", suffix); - ins->velocityBCKernel = platform.buildKernel(fileName, kernelName, + kernelName = "insVelocityBC" + suffix; + velocityBCKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } else { // gradient kernel - sprintf(fileName, DINS "/okl/insVelocityGradient%s.okl", suffix); - sprintf(kernelName, "insVelocityGradient%s", suffix); - ins->velocityGradientKernel = platform.buildKernel(fileName, kernelName, + fileName = oklFilePrefix + "insVelocityGradient" + suffix + oklFileSuffix; + kernelName = "insVelocityGradient" + suffix; + velocityGradientKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - 
sprintf(fileName, DINS "/okl/insDiffusion%s.okl", suffix); - sprintf(kernelName, "insDiffusion%s", suffix); - ins->diffusionKernel = platform.buildKernel(fileName, kernelName, + fileName = oklFilePrefix + "insDiffusion" + suffix + oklFileSuffix; + kernelName = "insDiffusion" + suffix; + diffusionKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } //pressure gradient kernels - sprintf(fileName, DINS "/okl/insGradient%s.okl", suffix); - sprintf(kernelName, "insGradientVolume%s", suffix); - ins->gradientVolumeKernel = platform.buildKernel(fileName, kernelName, + fileName = oklFilePrefix + "insGradient" + suffix + oklFileSuffix; + kernelName = "insGradientVolume" + suffix; + gradientVolumeKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(kernelName, "insGradientSurface%s", suffix); - ins->gradientSurfaceKernel = platform.buildKernel(fileName, kernelName, + kernelName = "insGradientSurface" + suffix; + gradientSurfaceKernel = platform.buildKernel(fileName, kernelName, kernelInfo); //velocity divergence kernels - sprintf(fileName, DINS "/okl/insDivergence%s.okl", suffix); - sprintf(kernelName, "insDivergenceVolume%s", suffix); - ins->divergenceVolumeKernel = platform.buildKernel(fileName, kernelName, + fileName = oklFilePrefix + "insDivergence" + suffix + oklFileSuffix; + kernelName = "insDivergenceVolume" + suffix; + divergenceVolumeKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(kernelName, "insDivergenceSurface%s", suffix); - ins->divergenceSurfaceKernel = platform.buildKernel(fileName, kernelName, + kernelName = "insDivergenceSurface" + suffix; + divergenceSurfaceKernel = platform.buildKernel(fileName, kernelName, kernelInfo); //pressure solver kernels - if (ins->pressureIncrement) { - sprintf(fileName, DINS "/okl/insPressureIncrementRhs%s.okl", suffix); + if (pressureIncrement) { + fileName = oklFilePrefix + "insPressureIncrementRhs" + suffix + oklFileSuffix; - if (ins->pDisc_c0) - 
sprintf(kernelName, "insPressureIncrementRhs%s", suffix); + if (pDisc_c0) + kernelName = "insPressureIncrementRhs" + suffix; else - sprintf(kernelName, "insPressureIncrementIpdgRhs%s", suffix); - ins->pressureIncrementRhsKernel = platform.buildKernel(fileName, kernelName, + kernelName = "insPressureIncrementIpdgRhs" + suffix; + pressureIncrementRhsKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(kernelName, "insPressureIncrementBC%s", suffix); - ins->pressureIncrementBCKernel = platform.buildKernel(fileName, kernelName, + kernelName = "insPressureIncrementBC" + suffix; + pressureIncrementBCKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } else { - sprintf(fileName, DINS "/okl/insPressureRhs%s.okl", suffix); - if (ins->pDisc_c0) - sprintf(kernelName, "insPressureRhs%s", suffix); + fileName = oklFilePrefix + "insPressureRhs" + suffix + oklFileSuffix; + if (pDisc_c0) + kernelName = "insPressureRhs" + suffix; else - sprintf(kernelName, "insPressureIpdgRhs%s", suffix); - ins->pressureRhsKernel = platform.buildKernel(fileName, kernelName, + kernelName = "insPressureIpdgRhs" + suffix; + pressureRhsKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(kernelName, "insPressureBC%s", suffix); - ins->pressureBCKernel = platform.buildKernel(fileName, kernelName, + kernelName = "insPressureBC" + suffix; + pressureBCKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } - sprintf(fileName, DINS "/okl/insVorticity%s.okl", suffix); - sprintf(kernelName, "insVorticity%s", suffix); - ins->vorticityKernel = platform.buildKernel(fileName, kernelName, + fileName = oklFilePrefix + "insVorticity" + suffix + oklFileSuffix; + kernelName = "insVorticity" + suffix; + vorticityKernel = platform.buildKernel(fileName, kernelName, kernelInfo); if (mesh.dim==2) { - sprintf(fileName, DINS "/okl/insInitialCondition2D.okl"); - sprintf(kernelName, "insInitialCondition2D"); + fileName = oklFilePrefix + "insInitialCondition2D" 
+ oklFileSuffix; + kernelName = "insInitialCondition2D"; } else { - sprintf(fileName, DINS "/okl/insInitialCondition3D.okl"); - sprintf(kernelName, "insInitialCondition3D"); + fileName = oklFilePrefix + "insInitialCondition3D" + oklFileSuffix; + kernelName = "insInitialCondition3D"; } - ins->initialConditionKernel = platform.buildKernel(fileName, kernelName, + initialConditionKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(fileName, DINS "/okl/insMaxWaveSpeed%s.okl", suffix); - sprintf(kernelName, "insMaxWaveSpeed%s", suffix); - - ins->maxWaveSpeedKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - - return *ins; -} - -ins_t::~ins_t() { - advectionVolumeKernel.free(); - advectionSurfaceKernel.free(); - divergenceVolumeKernel.free(); - divergenceSurfaceKernel.free(); - gradientVolumeKernel.free(); - gradientSurfaceKernel.free(); - velocityGradientKernel.free(); - diffusionKernel.free(); - velocityRhsKernel.free(); - velocityBCKernel.free(); - pressureRhsKernel.free(); - pressureBCKernel.free(); - vorticityKernel.free(); - initialConditionKernel.free(); - maxWaveSpeedKernel.free(); - - if (pSolver) delete pSolver; - if (uSolver) delete uSolver; - if (vSolver) delete vSolver; - if (wSolver) delete wSolver; - if (timeStepper) delete timeStepper; - if (pLinearSolver) delete pLinearSolver; - if (uLinearSolver) delete uLinearSolver; - if (vLinearSolver) delete vLinearSolver; - if (wLinearSolver) delete wLinearSolver; - if (subStepper) delete subStepper; - if (subcycler) { - subcycler->subCycleAdvectionKernel.free(); - delete subcycler; - } + fileName = oklFilePrefix + "insMaxWaveSpeed" + suffix + oklFileSuffix; + kernelName = "insMaxWaveSpeed" + suffix; - if (vTraceHalo) vTraceHalo->Free(); - if (pTraceHalo) pTraceHalo->Free(); + maxWaveSpeedKernel = platform.buildKernel(fileName, kernelName, kernelInfo); } diff --git a/solvers/ins/src/insStep.cpp b/solvers/ins/src/insStep.cpp index 2f9f999e3..81790b843 100644 --- 
a/solvers/ins/src/insStep.cpp +++ b/solvers/ins/src/insStep.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,10 +26,10 @@ SOFTWARE. #include "ins.hpp" -dfloat ins_t::MaxWaveSpeed(occa::memory& o_U, const dfloat T){ +dfloat ins_t::MaxWaveSpeed(deviceMemory& o_U, const dfloat T){ //Note: if this is on the critical path in the future, we should pre-allocate this - occa::memory o_maxSpeed = platform.malloc(mesh.Nelements*sizeof(dfloat)); + deviceMemory o_maxSpeed = platform.malloc(mesh.Nelements); maxWaveSpeedKernel(mesh.Nelements, mesh.o_vgeo, @@ -44,18 +44,17 @@ dfloat ins_t::MaxWaveSpeed(occa::memory& o_U, const dfloat T){ o_U, o_maxSpeed); - const dfloat vmax = platform.linAlg.max(mesh.Nelements, o_maxSpeed, mesh.comm); + const dfloat vmax = platform.linAlg().max(mesh.Nelements, o_maxSpeed, mesh.comm); - o_maxSpeed.free(); return vmax; } // Inversion of diffusion operator // Solves gamma*U - mu*Laplacian*U = rhs // Afterwards, imposes incompressiblity via pressure problem -void ins_t::rhs_imex_invg(occa::memory& o_RHS, occa::memory& o_U, const dfloat gamma, const dfloat T){ +void ins_t::rhs_imex_invg(deviceMemory& o_RHS, deviceMemory& o_U, const dfloat gamma, const dfloat T){ - const dfloat dt = timeStepper->GetTimeStep(); + const dfloat dt = timeStepper.GetTimeStep(); if (pressureIncrement) { //use current pressure in velocity RHS @@ -108,28 +107,27 @@ void ins_t::rhs_imex_invg(occa::memory& o_RHS, occa::memory& o_U, const dfloat g } // Evaluation of rhs f function -void ins_t::rhs_imex_f(occa::memory& o_U, occa::memory& o_RHS, const dfloat T){ +void ins_t::rhs_imex_f(deviceMemory& o_U, deviceMemory& o_RHS, const dfloat T){ // RHS = N(U) Advection(1.0, o_U, 0.0, 
o_RHS, T); } // Evolve rhs f function via a sub-timestepper -void ins_t::rhs_subcycle_f(occa::memory& o_U, occa::memory& o_UHAT, - const dfloat T, const dfloat dt, const dfloat* B, +void ins_t::rhs_subcycle_f(deviceMemory& o_U, deviceMemory& o_UHAT, + const dfloat T, const dfloat dt, const memory B, const int order, const int shiftIndex, const int maxOrder) { //subcycle each Lagrangian state qhat by stepping dqhat/dt = F(qhat,t) + LIBP_ABORT("Subcycling supports only order 3 interpolation for now.", + order>=3); - if (order>=3) - LIBP_ABORT("Subcycling supports only order 3 interpolation for now.") + subcycler.order = order; + subcycler.maxOrder = maxOrder; + subcycler.shiftIndex = shiftIndex; + subcycler.T0 = T; + subcycler.dt = dt; - subcycler->order = order; - subcycler->maxOrder = maxOrder; - subcycler->shiftIndex = shiftIndex; - subcycler->T0 = T; - subcycler->dt = dt; - - subcycler->o_Uh = o_U; //history + subcycler.o_Uh = o_U; //history //At each iteration of n, we step the partial sum // sum_i=n^order B[i]*U(t-i*dt) from t-n*dt to t-(n-1)*dt @@ -142,13 +140,13 @@ void ins_t::rhs_subcycle_f(occa::memory& o_U, occa::memory& o_UHAT, for (int n=order;n>=0;n--) { //for each history state, starting with oldest //q at t-n*dt - occa::memory o_Un = o_U + ((shiftIndex+n)%maxOrder)*N*sizeof(dfloat); + deviceMemory o_Un = o_U + ((shiftIndex+n)%maxOrder)*N; //next scaled partial sum - linAlg.axpy(N, B[n+1]/(B[n+1]+bSum), o_Un, - bSum/(B[n+1]+bSum), o_UHAT); + platform.linAlg().axpy(N, B[n+1]/(B[n+1]+bSum), o_Un, + bSum/(B[n+1]+bSum), o_UHAT); bSum += B[n+1]; - subStepper->Run(o_UHAT, T-n*dt, T-(n-1)*dt); + subStepper.Run(subcycler, o_UHAT, T-n*dt, T-(n-1)*dt); } } diff --git a/solvers/ins/src/insSubcycle.cpp b/solvers/ins/src/insSubcycle.cpp index 1fd62171b..b51a2e06c 100644 --- a/solvers/ins/src/insSubcycle.cpp +++ b/solvers/ins/src/insSubcycle.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright 
(c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -26,19 +26,8 @@ SOFTWARE. #include "ins.hpp" -subcycler_t::subcycler_t(ins_t& ins): - solver_t(ins.platform, ins.settings), mesh(ins.mesh) { - - NVfields = ins.NVfields; - nu = ins.nu; - cubature = ins.cubature; - vTraceHalo = ins.vTraceHalo; - advectionVolumeKernel = ins.advectionVolumeKernel; - advectionSurfaceKernel = ins.advectionSurfaceKernel; -} - //evaluate ODE rhs = f(q,t) -void subcycler_t::rhsf(occa::memory& o_U, occa::memory& o_RHS, const dfloat T){ +void subcycler_t::rhsf(deviceMemory& o_U, deviceMemory& o_RHS, const dfloat T){ //interpolate velocity history for advective field (halo elements first) if(mesh.NhaloElements) @@ -55,7 +44,7 @@ void subcycler_t::rhsf(occa::memory& o_U, occa::memory& o_RHS, const dfloat T){ o_Ue); // extract Ue halo - vTraceHalo->ExchangeStart(o_Ue, 1, ogs_dfloat); + vTraceHalo.ExchangeStart(o_Ue, 1); if(mesh.NinternalElements) subCycleAdvectionKernel(mesh.NinternalElements, @@ -71,10 +60,10 @@ void subcycler_t::rhsf(occa::memory& o_U, occa::memory& o_RHS, const dfloat T){ o_Ue); // finish exchange of Ue - vTraceHalo->ExchangeFinish(o_Ue, 1, ogs_dfloat); + vTraceHalo.ExchangeFinish(o_Ue, 1); // extract u halo on DEVICE - vTraceHalo->ExchangeStart(o_U, 1, ogs_dfloat); + vTraceHalo.ExchangeStart(o_U, 1); if (cubature) advectionVolumeKernel(mesh.Nelements, @@ -95,7 +84,7 @@ void subcycler_t::rhsf(occa::memory& o_U, occa::memory& o_RHS, const dfloat T){ o_U, o_RHS); - vTraceHalo->ExchangeFinish(o_U, 1, ogs_dfloat); + vTraceHalo.ExchangeFinish(o_U, 1); if (cubature) advectionSurfaceKernel(mesh.Nelements, diff --git a/solvers/ins/src/insVelocitySolve.cpp b/solvers/ins/src/insVelocitySolve.cpp index 69b6124e8..59f2a5c4f 100644 --- a/solvers/ins/src/insVelocitySolve.cpp +++ b/solvers/ins/src/insVelocitySolve.cpp 
@@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,12 +27,13 @@ SOFTWARE. #include "ins.hpp" // Solves gamma*U - nu*Laplacian*U = rhs -void ins_t::VelocitySolve(occa::memory& o_U, occa::memory& o_RHS, +void ins_t::VelocitySolve(deviceMemory& o_U, deviceMemory& o_RHS, const dfloat gamma, const dfloat T) { // compute RHS = MM*RHS/nu + BCdata // and split fields to separate arrays velocityRhsKernel(mesh.Nelements, + mesh.o_wJ, mesh.o_vgeo, mesh.o_sgeo, mesh.o_ggeo, @@ -43,7 +44,7 @@ void ins_t::VelocitySolve(occa::memory& o_U, occa::memory& o_RHS, mesh.o_sM, mesh.o_vmapM, mesh.o_EToB, - o_mapB, + mesh.o_mapB, vTau, T, mesh.o_x, @@ -63,39 +64,39 @@ void ins_t::VelocitySolve(occa::memory& o_U, occa::memory& o_RHS, int maxIter = 5000; int verbose = 0; - uSolver->lambda = gamma/nu; - vSolver->lambda = gamma/nu; - wSolver->lambda = gamma/nu; + uSolver.lambda = gamma/nu; + vSolver.lambda = gamma/nu; + wSolver.lambda = gamma/nu; // Solve lambda*U - Laplacian*U = rhs if (vDisc_c0){ // gather, solve, scatter - uSolver->ogsMasked->Gather(o_GrhsU, o_rhsU, ogs_dfloat, ogs_add, ogs_trans); - NiterU = uSolver->Solve(*uLinearSolver, o_GUH, o_GrhsU, velTOL, maxIter, verbose); - uSolver->ogsMasked->Scatter(o_UH, o_GUH, ogs_dfloat, ogs_add, ogs_notrans); + uSolver.ogsMasked.Gather(o_GrhsU, o_rhsU, 1, ogs::Add, ogs::Trans); + NiterU = uSolver.Solve(uLinearSolver, o_GUH, o_GrhsU, velTOL, maxIter, verbose); + uSolver.ogsMasked.Scatter(o_UH, o_GUH, 1, ogs::NoTrans); - vSolver->ogsMasked->Gather(o_GrhsV, o_rhsV, ogs_dfloat, ogs_add, ogs_trans); - NiterV = vSolver->Solve(*vLinearSolver, o_GVH, o_GrhsV, velTOL, maxIter, verbose); - vSolver->ogsMasked->Scatter(o_VH, o_GVH, ogs_dfloat, ogs_add, 
ogs_notrans); + vSolver.ogsMasked.Gather(o_GrhsV, o_rhsV, 1, ogs::Add, ogs::Trans); + NiterV = vSolver.Solve(vLinearSolver, o_GVH, o_GrhsV, velTOL, maxIter, verbose); + vSolver.ogsMasked.Scatter(o_VH, o_GVH, 1, ogs::NoTrans); if (mesh.dim==3) { - wSolver->ogsMasked->Gather(o_GrhsW, o_rhsW, ogs_dfloat, ogs_add, ogs_trans); - NiterW = wSolver->Solve(*wLinearSolver, o_GWH, o_GrhsW, velTOL, maxIter, verbose); - wSolver->ogsMasked->Scatter(o_WH, o_GWH, ogs_dfloat, ogs_add, ogs_notrans); + wSolver.ogsMasked.Gather(o_GrhsW, o_rhsW, 1, ogs::Add, ogs::Trans); + NiterW = wSolver.Solve(wLinearSolver, o_GWH, o_GrhsW, velTOL, maxIter, verbose); + wSolver.ogsMasked.Scatter(o_WH, o_GWH, 1, ogs::NoTrans); } } else { - NiterU = uSolver->Solve(*uLinearSolver, o_UH, o_rhsU, velTOL, maxIter, verbose); - NiterV = vSolver->Solve(*vLinearSolver, o_VH, o_rhsV, velTOL, maxIter, verbose); + NiterU = uSolver.Solve(uLinearSolver, o_UH, o_rhsU, velTOL, maxIter, verbose); + NiterV = vSolver.Solve(vLinearSolver, o_VH, o_rhsV, velTOL, maxIter, verbose); if (mesh.dim==3) - NiterW = wSolver->Solve(*wLinearSolver, o_WH, o_rhsW, velTOL, maxIter, verbose); + NiterW = wSolver.Solve(wLinearSolver, o_WH, o_rhsW, velTOL, maxIter, verbose); } // merge arrays back, and enter BCs if C0 velocityBCKernel(mesh.Nelements, mesh.o_sgeo, mesh.o_vmapM, - o_mapB, + mesh.o_mapB, T, mesh.o_x, mesh.o_y, diff --git a/solvers/lbs/data/lbsGaussian2D.h b/solvers/lbs/data/lbsGaussian2D.h index 02ab2ee02..6cecc7760 100644 --- a/solvers/lbs/data/lbsGaussian2D.h +++ b/solvers/lbs/data/lbsGaussian2D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/lbs/data/lbsGaussian3D.h b/solvers/lbs/data/lbsGaussian3D.h index 
88e5ef82e..d10d70434 100644 --- a/solvers/lbs/data/lbsGaussian3D.h +++ b/solvers/lbs/data/lbsGaussian3D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/lbs/data/lbsUniform2D.h b/solvers/lbs/data/lbsUniform2D.h index 36b31f7c4..fd2acaab8 100644 --- a/solvers/lbs/data/lbsUniform2D.h +++ b/solvers/lbs/data/lbsUniform2D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/lbs/data/lbsUniform3D.h b/solvers/lbs/data/lbsUniform3D.h index fb6c3722a..cef99167b 100644 --- a/solvers/lbs/data/lbsUniform3D.h +++ b/solvers/lbs/data/lbsUniform3D.h @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/solvers/lbs/lbs.hpp b/solvers/lbs/lbs.hpp index 1cc35411f..e4a29133a 100644 --- a/solvers/lbs/lbs.hpp +++ b/solvers/lbs/lbs.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) - Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the 
"Software"), to deal @@ -36,28 +36,30 @@ #define DLBS LIBP_DIR"/solvers/lbs/" +using namespace libp; + class lbsSettings_t: public settings_t { public: - lbsSettings_t(MPI_Comm& _comm); + lbsSettings_t(comm_t& _comm); void report(); void parseFromFile(platformSettings_t& platformSettings, meshSettings_t& meshSettings, - const string filename); + const std::string filename); }; class lbs_t: public solver_t { public: - mesh_t& mesh; + mesh_t mesh; int Nfields; int Nmacro; int Npmlfields; int velModel; - TimeStepper::timeStepper_t* timeStepper; + timeStepper_t timeStepper; - halo_t* traceHalo; - halo_t** multirateTraceHalo; + ogs::halo_t traceHalo; + memory multirateTraceHalo; // dfloat RT, c, tauInv, Ma, Re, nu; // Flow parameters dfloat RT, c, tauInv, Re, nu, alpha; // Flow parameters @@ -65,7 +67,7 @@ class lbs_t: public solver_t { // Pml int pmlOrder; dfloat sigmaXmax, sigmaYmax, sigmaZmax; - dfloat *pmlSigma; + memory pmlSigma; dfloat pmlAlpha; // Flag for using cubature integration for sigma terms in pml @@ -74,57 +76,56 @@ class lbs_t: public solver_t { // Flag for semi-analytic timestepping int semiAnalytic; - dfloat *q; - occa::memory o_q; - + memory q; + deviceMemory o_q; + // external forcing in velocity space - dfloat *F; - occa::memory o_F; - + memory F; + deviceMemory o_F; + // Macro quantities i.e. 
density + velocity - dfloat *U; - occa::memory o_U; + memory U; + deviceMemory o_U; - dfloat *LBM; - occa::memory o_LBM; + memory LBM; + deviceMemory o_LBM; - int *LMAP; - occa::memory o_LMAP; + memory LMAP; + deviceMemory o_LMAP; - occa::memory o_Mq; + deviceMemory o_Mq; - dfloat *Vort, *VortMag; - occa::memory o_Vort, o_VortMag; + memory Vort, VortMag; + deviceMemory o_Vort, o_VortMag; - occa::memory o_pmlSigma; + deviceMemory o_pmlSigma; - occa::kernel collisionKernel; - occa::kernel forcingKernel; - occa::kernel momentsKernel; - occa::kernel phaseFieldKernel; + kernel_t collisionKernel; + kernel_t forcingKernel; + kernel_t momentsKernel; + kernel_t phaseFieldKernel; - occa::kernel volumeKernel; - occa::kernel surfaceKernel; - occa::kernel relaxationKernel; + kernel_t volumeKernel; + kernel_t surfaceKernel; + kernel_t relaxationKernel; - occa::kernel pmlVolumeKernel; - occa::kernel pmlSurfaceKernel; - occa::kernel pmlRelaxationKernel; + kernel_t pmlVolumeKernel; + kernel_t pmlSurfaceKernel; + kernel_t pmlRelaxationKernel; - occa::kernel vorticityKernel; + kernel_t vorticityKernel; - occa::kernel initialConditionKernel; + kernel_t initialConditionKernel; - lbs_t() = delete; + lbs_t() = default; lbs_t(platform_t &_platform, mesh_t &_mesh, - lbsSettings_t& _settings): - solver_t(_platform, _settings), mesh(_mesh) {} - - ~lbs_t(); + lbsSettings_t& _settings) { + Setup(_platform, _mesh, _settings); + } //setup - static lbs_t& Setup(platform_t& platform, mesh_t& mesh, - lbsSettings_t& settings); + void Setup(platform_t& _platform, mesh_t& _mesh, + lbsSettings_t& _settings); void PmlSetup(); @@ -132,16 +133,15 @@ class lbs_t: public solver_t { void Report(dfloat time, int tstep); - void PlotFields(dfloat* Q, dfloat* V, char *fileName); + void PlotFields(memory& Q, memory& V, std::string fileName); dfloat MaxWaveSpeed(); + void rhsf(deviceMemory& o_Q, deviceMemory& o_RHS, const dfloat T); - void rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T); - - void 
rhsVolume(dlong N, occa::memory& o_Q, occa::memory& o_RHS, const dfloat T); + void rhsVolume(dlong N, deviceMemory& o_Q, deviceMemory& o_RHS, const dfloat T); - void rhsSurface(dlong N, occa::memory& o_Q, occa::memory& o_RHS, const dfloat T); + void rhsSurface(dlong N, deviceMemory& o_Q, deviceMemory& o_RHS, const dfloat T); void latticeSetup(); }; diff --git a/solvers/lbs/lbsMain.cpp b/solvers/lbs/lbsMain.cpp index aa3861168..fc77abf7e 100644 --- a/solvers/lbs/lbsMain.cpp +++ b/solvers/lbs/lbsMain.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -29,39 +29,40 @@ SOFTWARE. int main(int argc, char **argv){ // start up MPI - MPI_Init(&argc, &argv); + Comm::Init(argc, argv); - MPI_Comm comm = MPI_COMM_WORLD; + LIBP_ABORT("Usage: ./lbsMain setupfile", argc!=2); - if(argc!=2) - LIBP_ABORT(string("Usage: ./lbsMain setupfile")); + { /*Scope so everything is destructed before MPI_Finalize */ + comm_t comm(Comm::World().Dup()); - //create default settings - platformSettings_t platformSettings(comm); - meshSettings_t meshSettings(comm); - lbsSettings_t lbsSettings(comm); + //create default settings + platformSettings_t platformSettings(comm); + meshSettings_t meshSettings(comm); + lbsSettings_t lbsSettings(comm); + //load settings from file + lbsSettings.parseFromFile(platformSettings, meshSettings, + argv[1]); - //load settings from file - lbsSettings.parseFromFile(platformSettings, meshSettings, - argv[1]); + // set up platform + platform_t platform(platformSettings); - // set up platform - platform_t platform(platformSettings); + platformSettings.report(); + meshSettings.report(); + lbsSettings.report(); - platformSettings.report(); - meshSettings.report(); - 
lbsSettings.report(); + // set up mesh + mesh_t mesh(platform, meshSettings, comm); - // set up mesh - mesh_t& mesh = mesh_t::Setup(platform, meshSettings, comm); + // set up lbs solver + lbs_t lbs(platform, mesh, lbsSettings); - // set up lbs solver - lbs_t& lbs = lbs_t::Setup(platform, mesh, lbsSettings); + // run + lbs.Run(); + } - // run - lbs.Run(); // close down MPI - MPI_Finalize(); + Comm::Finalize(); return LIBP_SUCCESS; } diff --git a/solvers/lbs/makefile b/solvers/lbs/makefile index e5f175365..87f127636 100644 --- a/solvers/lbs/makefile +++ b/solvers/lbs/makefile @@ -2,7 +2,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal @@ -77,26 +77,22 @@ include ../../make.top endif endif -#gslib -GS_DIR=${LIBP_TPL_DIR}/gslib - #libraries -LBS_LIBP_LIBS=timeStepper mesh ogs linAlg core +LBS_LIBP_LIBS=timeStepper mesh parAdogs ogs linAlg core #includes INCLUDES=${LIBP_INCLUDES} \ - -I. + -I. 
#defines DEFINES =${LIBP_DEFINES} \ - -DLIBP_DIR='"${LIBP_DIR}"' + -DLIBP_DIR='"${LIBP_DIR}"' #.cpp compilation flags -LBS_CXXFLAGS=${LIBP_MPICXXFLAGS} ${DEFINES} ${INCLUDES} +LBS_CXXFLAGS=${LIBP_CXXFLAGS} ${DEFINES} ${INCLUDES} #link libraries LIBS=-L${LIBP_LIBS_DIR} $(addprefix -l,$(LBS_LIBP_LIBS)) \ - -L$(GS_DIR)/lib -lgs \ - ${LIBP_LIBS} + ${LIBP_LIBS} #link flags LFLAGS=${LBS_CXXFLAGS} ${LIBS} @@ -111,7 +107,7 @@ SRC =$(wildcard src/*.cpp) OBJS=$(SRC:.cpp=.o) .PHONY: all lib libp_libs clean clean-libs \ -clean-kernels realclean help info + clean-kernels realclean help info all: lbsMain @@ -143,10 +139,10 @@ endif # rule for .cpp files %.o: %.cpp $(DEPS) | libp_libs ifneq (,${verbose}) - $(LIBP_MPICXX) -o $*.o -c $*.cpp $(LBS_CXXFLAGS) + $(LIBP_CXX) -o $*.o -c $*.cpp $(LBS_CXXFLAGS) else @printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n"; - @$(LIBP_MPICXX) -o $*.o -c $*.cpp $(LBS_CXXFLAGS) + @$(LIBP_CXX) -o $*.o -c $*.cpp $(LBS_CXXFLAGS) endif #cleanup @@ -157,8 +153,7 @@ clean-libs: clean ${MAKE} -C ${LIBP_LIBS_DIR} clean clean-kernels: clean-libs -# $(shell ${OCCA_DIR}/bin/occa clear all -y) - rm -rf ~/.occa/ + rm -rf ${LIBP_DIR}/.occa/ realclean: clean ${MAKE} -C ${LIBP_LIBS_DIR} realclean diff --git a/solvers/lbs/okl/lbsCollisionHex3D.okl b/solvers/lbs/okl/lbsCollisionHex3D.okl index cbb9b4aa7..ade97fdab 100644 --- a/solvers/lbs/okl/lbsCollisionHex3D.okl +++ b/solvers/lbs/okl/lbsCollisionHex3D.okl @@ -2,7 +2,7 @@ The MIT License (MIT) - Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -36,178 +36,178 @@ void equiDist3D(const dfloat ew, const dfloat ex, const dfloat ey, const dfloat // Compute collision step, physical velocity and scaled external forcing @kernel void lbsCollisionHex3D(const dlong 
Nelements, - // @restrict const dlong * elementIds, - const dfloat t, - const dfloat dt, - const dfloat gamma, // lambda/dt - const dfloat nu, // 1/Re - @restrict const dfloat * LBM, - @restrict const dfloat * x, - @restrict const dfloat * y, - @restrict const dfloat * z, - @restrict const dfloat * F, - @restrict const dfloat * U, - @restrict dfloat * q){ + // @restrict const dlong * elementIds, + const dfloat t, + const dfloat dt, + const dfloat gamma, // lambda/dt + const dfloat nu, // 1/Re + @restrict const dfloat * LBM, + @restrict const dfloat * x, + @restrict const dfloat * y, + @restrict const dfloat * z, + @restrict const dfloat * F, + @restrict const dfloat * U, + @restrict dfloat * q){ for(dlong e=0;e0){ const dfloat ew = LBM[fld + 0*p_Nfields]; const int idr = LMAP[fld]; - applyBC2D(bc, fld, idr, dt, en, nx, ny, ew, ex, ey,rb, ub, vb, fn, qm, qp); + applyBC2D(bc, fld, idr, dt, en, nx, ny, ew, ex, ey,rb, ub, vb, fn, qm, qp); } s_fluxq[es][fld][j][i] += 0.5f*sc*(en -fabs(en))*(qm[fld] - qp[fld]); } } - - + + @kernel void lbsSurfaceQuad2D(const dlong Nelements, - // @restrict const dlong * elementIds, - @restrict const dfloat * sgeo, - @restrict const dfloat * LIFT, - @restrict const dlong * vmapM, - @restrict const dlong * vmapP, - @restrict const int * EToB, - @restrict const dfloat * x, - @restrict const dfloat * y, - @restrict const dfloat * z, - const dfloat dt, - const dfloat time, - const dfloat nu, - @restrict const int * LMAP, - @restrict const dfloat * LBM, - @restrict const dfloat * F, - @restrict const dfloat * U, - @restrict const dfloat * q, - @restrict dfloat * rhsq){ + // @restrict const dlong * elementIds, + @restrict const dfloat * sgeo, + @restrict const dfloat * LIFT, + @restrict const dlong * vmapM, + @restrict const dlong * vmapP, + @restrict const int * EToB, + @restrict const dfloat * x, + @restrict const dfloat * y, + @restrict const dfloat * z, + const dfloat dt, + const dfloat time, + const dfloat nu, + @restrict const int * LMAP, 
+ @restrict const dfloat * LBM, + @restrict const dfloat * F, + @restrict const dfloat * U, + @restrict const dfloat * q, + @restrict dfloat * rhsq){ // for all elements for(dlong eo=0;eo0 || bc==-1){ - const dlong uidM = eM*p_Np*p_Nmacro + vidM; - rm = U[uidM+ 0*p_Np]; - um = U[uidM+ 1*p_Np]; - vm = U[uidM+ 2*p_Np]; - wm = U[uidM+ 3*p_Np]; - - lbsBoundaryConditions3D(bc, nu, time, x[idM], y[idM], z[idM], nx, ny, nz, - rm, um, vm, wm, - &rb, &ub, &vb, &wb); - } + for(int n=0;n0 || bc==-1){ + const dlong uidM = eM*p_Np*p_Nmacro + vidM; + rm = U[uidM+ 0*p_Np]; + um = U[uidM+ 1*p_Np]; + vm = U[uidM+ 2*p_Np]; + wm = U[uidM+ 3*p_Np]; + + lbsBoundaryConditions3D(bc, nu, time, x[idM], y[idM], z[idM], nx, ny, nz, + rm, um, vm, wm, + &rb, &ub, &vb, &wb); + } - dfloat qm[p_Nfields]; - dfloat qp[p_Nfields]; + dfloat qm[p_Nfields]; + dfloat qp[p_Nfields]; - #pragma unroll p_Nfields - for(int fld=0; fld0){ - const dfloat ew = LBM[fld + 0*p_Nfields]; - const int idr = LMAP[fld]; - applyBC3D(bc, fld, idr, dt, en, nx, ny, nz, ew, ex, ey, ez, rb, ub, vb, wb, fn, qm, qp); - } - - s_fluxq[fld][n] = 0.5f*sc*(en -fabs(en))*(qm[fld] - qp[fld]); + #pragma unroll p_Nfields + for(int fld=0; fld0){ + const dfloat ew = LBM[fld + 0*p_Nfields]; + const int idr = LMAP[fld]; + applyBC3D(bc, fld, idr, dt, en, nx, ny, nz, ew, ex, ey, ez, rb, ub, vb, wb, fn, qm, qp); } + + s_fluxq[fld][n] = 0.5f*sc*(en -fabs(en))*(qm[fld] - qp[fld]); } } } + } - // wait for all @shared memory writes of the previous inner loop to complete - @barrier("local"); - // for each node in the element - for(int n=0;n(LBM); + o_LMAP = platform.malloc(LMAP); } diff --git a/solvers/lbs/src/lbsPlotFields.cpp b/solvers/lbs/src/lbsPlotFields.cpp index 75e17e849..ca266f601 100644 --- a/solvers/lbs/src/lbsPlotFields.cpp +++ b/solvers/lbs/src/lbsPlotFields.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) - Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse 
Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,11 +27,11 @@ #include "lbs.hpp" // interpolate data to plot nodes and save to file (one per process) -void lbs_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){ +void lbs_t::PlotFields(memory& Q, memory& V, std::string fileName){ FILE *fp; - fp = fopen(fileName, "w"); + fp = fopen(fileName.c_str(), "w"); fprintf(fp, "\n"); fprintf(fp, " \n"); @@ -44,36 +44,42 @@ void lbs_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){ fprintf(fp, " \n"); //scratch space for interpolation - size_t NscratchBytes = mymax(mesh.Np, mesh.plotNp)*sizeof(dfloat); - dfloat* scratch = (dfloat *) malloc(2*NscratchBytes); + size_t Nscratch = std::max(mesh.Np, mesh.plotNp); + memory scratch(2*Nscratch); - dfloat* Ix = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iy = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iz = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); + memory Ix(mesh.plotNp); + memory Iy(mesh.plotNp); + memory Iz(mesh.plotNp); // compute plot node coordinates on the fly for(dlong e=0;e\n"); fprintf(fp, " \n"); - free(Ix); free(Iy); free(Iz); - - dfloat* Ir = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iu = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iv = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); - dfloat* Iw = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat)); + memory Ir(mesh.plotNp); + memory Iu(mesh.plotNp); + memory Iv(mesh.plotNp); + memory Iw(mesh.plotNp); fprintf(fp, " \n"); - if (U!=nullptr) { + if (U.length()!=0) { // write out velocity fprintf(fp, " \n", mesh.dim); for(dlong e=0;e\n"); } - if (U!=nullptr) { + if (U.length()!=0) { // write out pressure fprintf(fp, " \n"); for(dlong e=0;e\n"); } - if (V!=nullptr) { + if (V.length()!=0) { // write out vorticity if(mesh.dim==2){ fprintf(fp, " \n"); @@ -138,8 +144,6 @@ void 
lbs_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){ } fprintf(fp, " \n"); - free(Ir); free(Iu); free(Iv); free(Iw); - fprintf(fp, " \n"); fprintf(fp, " \n"); @@ -180,6 +184,4 @@ void lbs_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){ fprintf(fp, " \n"); fprintf(fp, "\n"); fclose(fp); - - free(scratch); } diff --git a/solvers/lbs/src/lbsPmlSetup.cpp b/solvers/lbs/src/lbsPmlSetup.cpp index af00e57c2..b4796acb3 100644 --- a/solvers/lbs/src/lbsPmlSetup.cpp +++ b/solvers/lbs/src/lbsPmlSetup.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) - Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -108,7 +108,7 @@ void lbs_t::PmlSetup(){ int pmlNp = (pmlcubature) ? mesh.cubNp : mesh.Np; int pmlNq = (pmlcubature) ? mesh.cubNq : mesh.Nq; - dfloat *pmlr, *pmls, *pmlt; + memory pmlr, pmls, pmlt; if(pmlcubature){ pmlr = mesh.cubr; pmls = mesh.cubs; @@ -121,27 +121,27 @@ void lbs_t::PmlSetup(){ // printf("Setting PML Coefficient \n"); //set up damping parameter - pmlSigma = (dfloat *) calloc(mesh.dim*mesh.NpmlElements*pmlNp,sizeof(dfloat)); + pmlSigma.malloc(mesh.dim*mesh.NpmlElements*pmlNp); for (dlong m=0;m xe = mesh.EX + e*mesh.Nverts; + memory ye = mesh.EY + e*mesh.Nverts; + memory ze = mesh.EZ + e*mesh.Nverts; for(int n=0;n(pmlSigma); } } diff --git a/solvers/lbs/src/lbsReport.cpp b/solvers/lbs/src/lbsReport.cpp index a2de81dd0..b249a764e 100644 --- a/solvers/lbs/src/lbsReport.cpp +++ b/solvers/lbs/src/lbsReport.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) - Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 
documentation files (the "Software"), to deal @@ -38,7 +38,7 @@ void lbs_t::Report(dfloat time, int tstep){ mesh.MassMatrixApply(o_U, o_Mq); dlong Nentries = mesh.Nelements*mesh.Np*Nmacro; - dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm)); + dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm)); if(mesh.rank==0) printf("%5.2f (%d), %5.4f (time, timestep, norm)\n", time, tstep, norm2); @@ -51,12 +51,12 @@ void lbs_t::Report(dfloat time, int tstep){ o_Vort.copyTo(Vort); // output field files - string name; + std::string name; settings.getSetting("OUTPUT FILE NAME", name); char fname[BUFSIZ]; sprintf(fname, "%s_%04d_%04d.vtu", name.c_str(), mesh.rank, frame++); // PlotFields(o_q, Vort, fname); - PlotFields(U, Vort, fname); + PlotFields(U, Vort, std::string(fname)); } } diff --git a/solvers/lbs/src/lbsRun.cpp b/solvers/lbs/src/lbsRun.cpp index 1f352ef04..75b8e5bd0 100644 --- a/solvers/lbs/src/lbsRun.cpp +++ b/solvers/lbs/src/lbsRun.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) - Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -64,8 +64,8 @@ void lbs_t::Run(){ mesh.o_x, mesh.o_y, mesh.o_z, - o_U, - o_q); + o_U, + o_q); /* Artificial warping of time step size for multirate testing @@ -75,9 +75,9 @@ void lbs_t::Run(){ settings.compareSetting("TIME INTEGRATOR","MRSAAB3")) dt /= (1<<(mesh.mrNlevels-1)); #endif - timeStepper->SetTimeStep(dt); + timeStepper.SetTimeStep(dt); - timeStepper->Run(o_q, startTime, finalTime); + timeStepper.Run(*this, o_q, startTime, finalTime); // output norm of final solution @@ -88,7 +88,7 @@ void lbs_t::Run(){ mesh.MassMatrixApply(o_U, o_Mq); dlong Nentries = mesh.Nelements*mesh.Np*Nmacro; - dfloat norm2 = 
sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm)); + dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm)); if(mesh.rank==0) printf("Solution norm = %17.15lg\n", norm2); diff --git a/solvers/lbs/src/lbsSettings.cpp b/solvers/lbs/src/lbsSettings.cpp index 6c43eb175..f34abf8ea 100644 --- a/solvers/lbs/src/lbsSettings.cpp +++ b/solvers/lbs/src/lbsSettings.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) - Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,7 +27,7 @@ #include "lbs.hpp" //settings for lbs solver -lbsSettings_t::lbsSettings_t(MPI_Comm& _comm): +lbsSettings_t::lbsSettings_t(comm_t& _comm): settings_t(_comm) { newSetting("DATA FILE", @@ -96,10 +96,7 @@ lbsSettings_t::lbsSettings_t(MPI_Comm& _comm): void lbsSettings_t::report() { - int rank; - MPI_Comm_rank(comm, &rank); - - if (rank==0) { + if (comm.rank()==0) { std::cout << "LBS Settings:\n\n"; reportSetting("DATA FILE"); // reportSetting("SPEED OF SOUND"); @@ -120,15 +117,15 @@ void lbsSettings_t::report() { void lbsSettings_t::parseFromFile(platformSettings_t& platformSettings, meshSettings_t& meshSettings, - const string filename) { + const std::string filename) { //read all settings from file settings_t s(comm); s.readSettingsFromFile(filename); for(auto it = s.settings.begin(); it != s.settings.end(); ++it) { - setting_t* set = it->second; - const string name = set->getName(); - const string val = set->getVal(); + setting_t& set = it->second; + const std::string name = set.getName(); + const std::string val = set.getVal(); if (platformSettings.hasSetting(name)) platformSettings.changeSetting(name, val); else if (meshSettings.hasSetting(name)) @@ -136,9 +133,7 @@ void 
lbsSettings_t::parseFromFile(platformSettings_t& platformSettings, else if (hasSetting(name)) //self changeSetting(name, val); else { - stringstream ss; - ss << "Unknown setting: [" << name << "] requested"; - LIBP_ABORT(ss.str()); + LIBP_FORCE_ABORT("Unknown setting: [" << name << "] requested"); } } } diff --git a/solvers/lbs/src/lbsSetup.cpp b/solvers/lbs/src/lbsSetup.cpp index 682236cb7..0b18b950f 100644 --- a/solvers/lbs/src/lbsSetup.cpp +++ b/solvers/lbs/src/lbsSetup.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) - Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -28,185 +28,162 @@ #define D2Q9 1 #define D3Q15 2 +void lbs_t::Setup(platform_t& _platform, mesh_t& _mesh, + lbsSettings_t& _settings){ -lbs_t& lbs_t::Setup(platform_t& platform, mesh_t& mesh, - lbsSettings_t& settings){ + platform = _platform; + mesh = _mesh; + comm = _mesh.comm; + settings = _settings; - lbs_t* lbs = new lbs_t(platform, mesh, settings); + //Trigger JIT kernel builds + ogs::InitializeKernels(platform, ogs::Dfloat, ogs::Add); // Set reference lattice-Boltzmann data - lbs->latticeSetup(); + latticeSetup(); - lbs->Npmlfields = mesh.dim*lbs->Nfields; + Npmlfields = mesh.dim*Nfields; // AK: not in use yet ... 
Setup PML - // lbs->PmlSetup(); + // PmlSetup(); //setup timeStepper - dlong Nlocal = mesh.Nelements*mesh.Np*lbs->Nfields; - dlong Nhalo = mesh.totalHaloPairs*mesh.Np*lbs->Nfields; + dlong Nlocal = mesh.Nelements*mesh.Np*Nfields; + dlong Nhalo = mesh.totalHaloPairs*mesh.Np*Nfields; //make array of time step estimates for each element - dfloat *EtoDT = (dfloat *) calloc(mesh.Nelements,sizeof(dfloat)); - dfloat vmax = lbs->MaxWaveSpeed(); + memory EtoDT(mesh.Nelements); + dfloat vmax = MaxWaveSpeed(); for(dlong e=0;etimeStepper = new TimeStepper::lserk4(mesh.Nelements, mesh.totalHaloPairs, - mesh.Np, lbs->Nfields, *lbs); + timeStepper.Setup(mesh.Nelements, + mesh.totalHaloPairs, + mesh.Np, Nfields, + platform, comm); }else { - LIBP_ABORT(string("Requested TIME INTEGRATOR not found.")); + LIBP_FORCE_ABORT("Requested TIME INTEGRATOR not found."); } //setup linear algebra module - platform.linAlg.InitKernels({"innerProd"}); + platform.linAlg().InitKernels({"innerProd"}); /*setup trace halo exchange */ - lbs->traceHalo = mesh.HaloTraceSetup(lbs->Nfields); + traceHalo = mesh.HaloTraceSetup(Nfields); // compute samples of q at interpolation nodes - lbs->q = (dfloat*) calloc(Nlocal+Nhalo, sizeof(dfloat)); - lbs->o_q = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), lbs->q); + q.malloc(Nlocal+Nhalo, 0.0); + o_q = platform.malloc(q); - lbs->F = (dfloat*) calloc(Nlocal+Nhalo, sizeof(dfloat)); - lbs->o_F = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), lbs->F); + F.malloc(Nlocal+Nhalo, 0.0); + o_F = platform.malloc(F); - lbs->Vort = (dfloat*) calloc(mesh.dim*mesh.Nelements*mesh.Np, sizeof(dfloat)); - lbs->o_Vort = platform.malloc((mesh.dim*mesh.Nelements*mesh.Np)*sizeof(dfloat), - lbs->Vort); + Vort.malloc(mesh.dim*mesh.Nelements*mesh.Np, 0.0); + o_Vort = platform.malloc(Vort); // Hold macro quantites i.e. 
density + velocities - lbs->U = (dfloat*) calloc((mesh.Nelements+mesh.totalHaloPairs)*mesh.Np*lbs->Nmacro, sizeof(dfloat)); - lbs->o_U = platform.malloc((mesh.Nelements+mesh.totalHaloPairs)*mesh.Np*lbs->Nmacro*sizeof(dfloat), lbs->U); - - // Lattice-Boltzmann Model - lbs->o_LBM = platform.malloc(lbs->Nfields*lbs->Nmacro*sizeof(dfloat), lbs->LBM); - lbs->o_LMAP = platform.malloc(lbs->Nfields*sizeof(int), lbs->LMAP); - + U.malloc((mesh.Nelements+mesh.totalHaloPairs)*mesh.Np*Nmacro, 0.0); + o_U = platform.malloc(U); - //storage for M*q during reporting - lbs->o_Mq = platform.malloc((mesh.Nelements+mesh.totalHaloPairs)*mesh.Np*lbs->Nmacro*sizeof(dfloat), lbs->U); - mesh.MassMatrixKernelSetup(lbs->Nmacro); // mass matrix operator + o_Mq = platform.malloc(U); + mesh.MassMatrixKernelSetup(Nmacro); // mass matrix operator // // OCCA build stuff - occa::properties kernelInfo = mesh.props; //copy base occa properties + properties_t kernelInfo = mesh.props; //copy base occa properties //add boundary data to kernel info - string dataFileName; + std::string dataFileName; settings.getSetting("DATA FILE", dataFileName); kernelInfo["includes"] += dataFileName; - kernelInfo["defines/" "p_Nfields"]= lbs->Nfields; - // kernelInfo["defines/" "p_Npmlfields"]= lbs->Npmlfields; - kernelInfo["defines/" "p_Nmacro"] = lbs->Nmacro; + kernelInfo["defines/" "p_Nfields"]= Nfields; + // kernelInfo["defines/" "p_Npmlfields"]= Npmlfields; + kernelInfo["defines/" "p_Nmacro"] = Nmacro; - kernelInfo["defines/" "p_c"] = lbs->c; - kernelInfo["defines/" "p_ic2"] = 1.0/ pow(lbs->c,2); - kernelInfo["defines/" "p_ic4"] = 1.0/ pow(lbs->c,4); + kernelInfo["defines/" "p_c"] = c; + kernelInfo["defines/" "p_ic2"] = 1.0/ pow(c,2); + kernelInfo["defines/" "p_ic4"] = 1.0/ pow(c,4); - int maxNodes = mymax(mesh.Np, (mesh.Nfp*mesh.Nfaces)); + int maxNodes = std::max(mesh.Np, (mesh.Nfp*mesh.Nfaces)); kernelInfo["defines/" "p_maxNodes"]= maxNodes; int blockMax = 256; if (platform.device.mode()=="CUDA") blockMax = 512; 
- int NblockV = mymax(1, blockMax/mesh.Np); + int NblockV = std::max(1, blockMax/mesh.Np); kernelInfo["defines/" "p_NblockV"]= NblockV; - int NblockS = mymax(1, blockMax/maxNodes); + int NblockS = std::max(1, blockMax/maxNodes); kernelInfo["defines/" "p_NblockS"]= NblockS; - kernelInfo["parser/" "automate-add-barriers"] = "disabled"; - // set kernel name suffix - char *suffix; - if(mesh.elementType==TRIANGLES) - suffix = strdup("Tri2D"); - if(mesh.elementType==QUADRILATERALS) - suffix = strdup("Quad2D"); - if(mesh.elementType==TETRAHEDRA) - suffix = strdup("Tet3D"); - if(mesh.elementType==HEXAHEDRA) - suffix = strdup("Hex3D"); - - char fileName[BUFSIZ], kernelName[BUFSIZ]; + std::string suffix; + if(mesh.elementType==Mesh::TRIANGLES) + suffix = "Tri2D"; + if(mesh.elementType==Mesh::QUADRILATERALS) + suffix = "Quad2D"; + if(mesh.elementType==Mesh::TETRAHEDRA) + suffix = "Tet3D"; + if(mesh.elementType==Mesh::HEXAHEDRA) + suffix = "Hex3D"; + + std::string oklFilePrefix = DLBS "/okl/"; + std::string oklFileSuffix = ".okl"; + + std::string fileName, kernelName; if (mesh.dim==2) { - sprintf(fileName, DLBS "/okl/lbsInitialCondition2D.okl"); - sprintf(kernelName, "lbsInitialCondition2D"); + fileName = oklFilePrefix + "lbsInitialCondition2D" + oklFileSuffix; + kernelName = "lbsInitialCondition2D"; } else { - sprintf(fileName, DLBS "/okl/lbsInitialCondition3D.okl"); - sprintf(kernelName, "lbsInitialCondition3D"); + fileName = oklFilePrefix + "lbsInitialCondition3D" + oklFileSuffix; + kernelName = "lbsInitialCondition3D"; } - lbs->initialConditionKernel = platform.buildKernel(fileName, kernelName, + initialConditionKernel = platform.buildKernel(fileName, kernelName, kernelInfo); // kernels from volume file - sprintf(fileName, DLBS "/okl/lbsCollision%s.okl", suffix); + fileName = oklFilePrefix + "lbsCollision" + suffix + oklFileSuffix; - sprintf(kernelName, "lbsCollision%s", suffix); - lbs->collisionKernel = platform.buildKernel(fileName, kernelName, + kernelName = 
"lbsCollision" + suffix; + collisionKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(kernelName, "lbsForcing%s", suffix); - lbs->forcingKernel = platform.buildKernel(fileName, kernelName, + kernelName = "lbsForcing" + suffix; + forcingKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(kernelName, "lbsMoments%s", suffix); - lbs->momentsKernel = platform.buildKernel(fileName, kernelName, + kernelName = "lbsMoments" + suffix; + momentsKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - sprintf(kernelName, "lbsPhaseField%s", suffix); - lbs->phaseFieldKernel = platform.buildKernel(fileName, kernelName, + kernelName = "lbsPhaseField" + suffix; + phaseFieldKernel = platform.buildKernel(fileName, kernelName, kernelInfo); // kernels from volume file - sprintf(fileName, DLBS "/okl/lbsVolume%s.okl", suffix); - sprintf(kernelName, "lbsVolume%s", suffix); - lbs->volumeKernel = platform.buildKernel(fileName, kernelName, + fileName = oklFilePrefix + "lbsVolume" + suffix + oklFileSuffix; + kernelName = "lbsVolume" + suffix; + volumeKernel = platform.buildKernel(fileName, kernelName, kernelInfo); // kernels from surface file - sprintf(fileName, DLBS "/okl/lbsSurface%s.okl", suffix); - - sprintf(kernelName, "lbsSurface%s", suffix); - lbs->surfaceKernel = platform.buildKernel(fileName, kernelName, + fileName = oklFilePrefix + "lbsSurface" + suffix + oklFileSuffix; + kernelName = "lbsSurface" + suffix; + surfaceKernel = platform.buildKernel(fileName, kernelName, kernelInfo); // vorticity calculation - sprintf(fileName, DLBS "/okl/lbsVorticity%s.okl", suffix); - sprintf(kernelName, "lbsVorticity%s", suffix); + fileName = oklFilePrefix + "lbsVorticity" + suffix + oklFileSuffix; + kernelName = "lbsVorticity" + suffix; - lbs->vorticityKernel = platform.buildKernel(fileName, kernelName, + vorticityKernel = platform.buildKernel(fileName, kernelName, kernelInfo); - - - - return *lbs; -} - -lbs_t::~lbs_t() { - 
volumeKernel.free(); - surfaceKernel.free(); - relaxationKernel.free(); - pmlVolumeKernel.free(); - pmlSurfaceKernel.free(); - pmlRelaxationKernel.free(); - vorticityKernel.free(); - initialConditionKernel.free(); - - if (timeStepper) delete timeStepper; - if (traceHalo) traceHalo->Free(); - - for (int lev=0;levFree(); } diff --git a/solvers/lbs/src/lbsStep.cpp b/solvers/lbs/src/lbsStep.cpp index cc7ab35d4..d0e861555 100644 --- a/solvers/lbs/src/lbsStep.cpp +++ b/solvers/lbs/src/lbsStep.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) - Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -32,53 +32,53 @@ dfloat lbs_t::MaxWaveSpeed(){ } //evaluate ODE rhs = f(q,t) -void lbs_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){ +void lbs_t::rhsf(deviceMemory& o_Q, deviceMemory& o_RHS, const dfloat T){ // extract q trace halo and start exchange - traceHalo->ExchangeStart(o_Q, 1, ogs_dfloat); + traceHalo.ExchangeStart(o_Q, 1); // compute volume contribution to lbs RHS rhsVolume(mesh.Nelements, o_Q, o_RHS, T); // complete trace halo exchange - traceHalo->ExchangeFinish(o_Q, 1, ogs_dfloat); + traceHalo.ExchangeFinish(o_Q, 1); // compute surface contribution to lbs RHS rhsSurface(mesh.Nelements, o_Q, o_RHS, T); } -void lbs_t::rhsVolume(dlong N, occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){ +void lbs_t::rhsVolume(dlong N, deviceMemory& o_Q, deviceMemory& o_RHS, const dfloat T){ // compute volume contribution to lbs RHS if (N){ - const dfloat dt = timeStepper->GetTimeStep(); - const dfloat gamma = alpha/timeStepper->GetTimeStep(); + const dfloat dt = timeStepper.GetTimeStep(); + const dfloat gamma = alpha/timeStepper.GetTimeStep(); forcingKernel(N, - T, - dt, - gamma, - nu, - o_LBM, - mesh.o_x, - 
mesh.o_y, - mesh.o_z, - o_Q, - o_F, - o_U); + T, + dt, + gamma, + nu, + o_LBM, + mesh.o_x, + mesh.o_y, + mesh.o_z, + o_Q, + o_F, + o_U); collisionKernel(N, - T, - dt, - gamma, - nu, - o_LBM, - mesh.o_x, - mesh.o_y, - mesh.o_z, - o_F, - o_U, - o_Q); + T, + dt, + gamma, + nu, + o_LBM, + mesh.o_x, + mesh.o_y, + mesh.o_z, + o_F, + o_U, + o_Q); volumeKernel(N, mesh.o_vgeo, @@ -97,9 +97,9 @@ void lbs_t::rhsVolume(dlong N, occa::memory& o_Q, occa::memory& o_RHS, const dfl } -void lbs_t::rhsSurface(dlong N, occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){ +void lbs_t::rhsSurface(dlong N, deviceMemory& o_Q, deviceMemory& o_RHS, const dfloat T){ - const dfloat dt = timeStepper->GetTimeStep(); + const dfloat dt = timeStepper.GetTimeStep(); // // compute volume contribution to lbs RHS if (N) surfaceKernel(N, diff --git a/test/makefile b/test/makefile index 3e9793b87..2e2b3da30 100644 --- a/test/makefile +++ b/test/makefile @@ -2,7 +2,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal diff --git a/test/test.py b/test/test.py index dfd8608aa..f182435c8 100755 --- a/test/test.py +++ b/test/test.py @@ -4,7 +4,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal @@ -88,20 +88,20 @@ def __init__(self, name, value): self.name = name self.value = value -def writeSetup(settings): +def writeSetup(filename, settings): str_settings="" for setting in settings: str_settings += "[" + setting.name + "]\n" 
str_settings += str(setting.value) + "\n\n" - file = open("setup.rc", "w") + file = open(filename+".rc", "w") file.write(str_settings) file.close() def test(name, cmd, settings, referenceNorm, ranks=1): #create input file - writeSetup(settings) + writeSetup("setup",settings) #print test name print(bcolors.TEST + f"{name:.<{alignWidth}}" + bcolors.ENDC, end="", flush=True) @@ -117,6 +117,8 @@ def test(name, cmd, settings, referenceNorm, ranks=1): print(run.stdout.decode()) print(bcolors.WARNING + name + " stderr:" + bcolors.ENDC) print(run.stderr.decode()) + #save the setup for reproducibility + writeSetup(name,settings) failed = 1 else: #collect last line of output @@ -133,6 +135,8 @@ def test(name, cmd, settings, referenceNorm, ranks=1): print(bcolors.FAIL + "FAIL" + bcolors.ENDC) print(bcolors.WARNING + "Expected Result: " + str(referenceNorm) + bcolors.ENDC) print(bcolors.WARNING + "Observed Result: " + str(norm) + bcolors.ENDC) + #save the setup for reproducibility + writeSetup(name,settings) failed = 1 else: #this failure is worse, so dump the whole output for debug @@ -141,8 +145,16 @@ def test(name, cmd, settings, referenceNorm, ranks=1): print(run.stdout.decode()) print(bcolors.WARNING + name + " stderr:" + bcolors.ENDC) print(run.stderr.decode()) + #save the setup for reproducibility + writeSetup(name,settings) failed = 1 + # writeSetup(name,settings) + # print(bcolors.WARNING + name + " stdout:" + bcolors.ENDC) + # print(run.stdout.decode()) + # print(bcolors.WARNING + name + " stderr:" + bcolors.ENDC) + # print(run.stderr.decode()) + #clean up os.remove(inputRC) @@ -162,10 +174,12 @@ def test(name, cmd, settings, referenceNorm, ranks=1): import testTimeStepper import testLinearSolver import testParAlmond + import testParAdogs import testInitialGuess failCount=0; failCount+=testMesh.main() + failCount+=testParAdogs.main() failCount+=testGradient.main() failCount+=testAdvection.main() failCount+=testAcoustics.main() diff --git a/test/testAcoustics.py 
b/test/testAcoustics.py index 03c443dfc..51932ccb6 100755 --- a/test/testAcoustics.py +++ b/test/testAcoustics.py @@ -4,7 +4,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal diff --git a/test/testAdvection.py b/test/testAdvection.py index 632864b0d..1f1fe4ceb 100755 --- a/test/testAdvection.py +++ b/test/testAdvection.py @@ -4,7 +4,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal diff --git a/test/testBns.py b/test/testBns.py index ff684d498..73a39db5c 100755 --- a/test/testBns.py +++ b/test/testBns.py @@ -4,7 +4,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal diff --git a/test/testCns.py b/test/testCns.py index 38f8bf173..226ce64bc 100755 --- a/test/testCns.py +++ b/test/testCns.py @@ -4,7 +4,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal diff --git a/test/testElliptic.py b/test/testElliptic.py 
index 0c52a2d0a..79f7e73cc 100755 --- a/test/testElliptic.py +++ b/test/testElliptic.py @@ -4,7 +4,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal @@ -266,7 +266,7 @@ def main(): cmd=ellipticBin, settings=ellipticSettings(element=6,data_file=ellipticData3D,dim=3, precon="NONE", discretization="IPDG"), - referenceNorm=0.353553400508458) + referenceNorm=0.353553400119087) failCount += test(name="testEllipticHex_Ipdg", cmd=ellipticBin, @@ -394,7 +394,7 @@ def main(): settings=ellipticSettings(element=6,data_file=ellipticData3D,dim=3, boundary_flag=-1, Lambda=0.0, discretization="IPDG"), - referenceNorm=0.0595408371412352) + referenceNorm=0.0595408272243646) failCount += test(name="testEllipticHex_Ipdg_AllNeumann", cmd=ellipticBin, diff --git a/test/testFokkerPlanck.py b/test/testFokkerPlanck.py index 7dd989019..43c42e069 100755 --- a/test/testFokkerPlanck.py +++ b/test/testFokkerPlanck.py @@ -4,7 +4,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal diff --git a/test/testGradient.py b/test/testGradient.py index ff00b56bd..c170a878c 100755 --- a/test/testGradient.py +++ b/test/testGradient.py @@ -4,7 +4,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and 
associated documentation files (the "Software"), to deal @@ -34,6 +34,7 @@ def gradientSettings(rcformat="2.0", data_file=gradientData2D, mesh="BOX", dim=2, element=4, nx=10, ny=10, nz=10, boundary_flag=1, degree=4, thread_model=device, platform_number=0, device_number=0, + paradogs_partitioning="NONE", output_to_file="FALSE"): return [setting_t("FORMAT", rcformat), setting_t("DATA FILE", data_file), @@ -48,6 +49,7 @@ def gradientSettings(rcformat="2.0", data_file=gradientData2D, setting_t("THREAD MODEL", thread_model), setting_t("PLATFORM NUMBER", platform_number), setting_t("DEVICE NUMBER", device_number), + setting_t("PARADOGS PARTITIONING", paradogs_partitioning), setting_t("OUTPUT TO FILE", output_to_file)] def main(): @@ -83,4 +85,4 @@ def main(): if __name__ == "__main__": failCount=0; failCount+=main() - sys.exit(failCount) \ No newline at end of file + sys.exit(failCount) diff --git a/test/testInitialGuess.py b/test/testInitialGuess.py index 0457af9e6..11836c053 100755 --- a/test/testInitialGuess.py +++ b/test/testInitialGuess.py @@ -4,7 +4,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal diff --git a/test/testIns.py b/test/testIns.py index fad570f28..72659b662 100755 --- a/test/testIns.py +++ b/test/testIns.py @@ -4,7 +4,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal diff --git a/test/testLbs.py b/test/testLbs.py old mode 100644 new mode 100755 index bb0bb6a80..842f5ffff 
--- a/test/testLbs.py +++ b/test/testLbs.py @@ -4,7 +4,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal diff --git a/test/testLinearSolver.py b/test/testLinearSolver.py index 522cd33c7..f899eedc6 100755 --- a/test/testLinearSolver.py +++ b/test/testLinearSolver.py @@ -4,7 +4,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal diff --git a/test/testMesh.py b/test/testMesh.py index b9de9054e..b356db09d 100755 --- a/test/testMesh.py +++ b/test/testMesh.py @@ -4,7 +4,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal @@ -118,4 +118,4 @@ def main(): if __name__ == "__main__": failCount=0; failCount+=main() - sys.exit(failCount) \ No newline at end of file + sys.exit(failCount) diff --git a/test/testParAdogs.py b/test/testParAdogs.py new file mode 100755 index 000000000..269582268 --- /dev/null +++ b/test/testParAdogs.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 + +##################################################################################### +# +#The MIT License (MIT) +# +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +# +#Permission is hereby granted, free of 
charge, to any person obtaining a copy +#of this software and associated documentation files (the "Software"), to deal +#in the Software without restriction, including without limitation the rights +#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +#copies of the Software, and to permit persons to whom the Software is +#furnished to do so, subject to the following conditions: +# +#The above copyright notice and this permission notice shall be included in all +#copies or substantial portions of the Software. +# +#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +#SOFTWARE. 
+# +##################################################################################### + +from test import * +from testGradient import * + +def main(): + failCount=0; + + failCount += test(name="testParAdogsTri_Inertial_MPI", ranks=2, + cmd=gradientBin, + settings=gradientSettings(element=3,data_file=gradientData2D,dim=2, + mesh=testDir+"/squareTri.msh", + paradogs_partitioning="INERTIAL"), + referenceNorm=0.580787485719841) + + failCount += test(name="testParAdogsQuad_Inertial_MPI", ranks=2, + cmd=gradientBin, + settings=gradientSettings(element=4,data_file=gradientData2D,dim=2, + mesh=testDir+"/squareQuad.msh", + paradogs_partitioning="INERTIAL"), + referenceNorm=0.580787485654967) + + failCount += test(name="testParAdogsTet_Inertial_MPI", ranks=2, + cmd=gradientBin, + settings=gradientSettings(element=6,data_file=gradientData3D,dim=3, + mesh=testDir+"/cubeTet.msh", + paradogs_partitioning="INERTIAL"), + referenceNorm=0.942816947760423) + + failCount += test(name="testParAdogsHex_Inertial_MPI", ranks=2, + cmd=gradientBin, + settings=gradientSettings(element=12,data_file=gradientData3D,dim=3, + mesh=testDir+"/cubeHex.msh", + paradogs_partitioning="INERTIAL"), + referenceNorm=0.942816869518335) + + failCount += test(name="testParAdogsTri_Spectral_MPI", ranks=2, + cmd=gradientBin, + settings=gradientSettings(element=3,data_file=gradientData2D,dim=2, + mesh=testDir+"/squareTri.msh", + paradogs_partitioning="SPECTRAL"), + referenceNorm=0.580787485719841) + + failCount += test(name="testParAdogsQuad_Spectral_MPI", ranks=2, + cmd=gradientBin, + settings=gradientSettings(element=4,data_file=gradientData2D,dim=2, + mesh=testDir+"/squareQuad.msh", + paradogs_partitioning="SPECTRAL"), + referenceNorm=0.580787485654967) + + failCount += test(name="testParAdogsTet_Spectral_MPI", ranks=2, + cmd=gradientBin, + settings=gradientSettings(element=6,data_file=gradientData3D,dim=3, + mesh=testDir+"/cubeTet.msh", + paradogs_partitioning="SPECTRAL"), + 
referenceNorm=0.942816947760423) + + failCount += test(name="testParAdogsHex_Spectral_MPI", ranks=2, + cmd=gradientBin, + settings=gradientSettings(element=12,data_file=gradientData3D,dim=3, + mesh=testDir+"/cubeHex.msh", + paradogs_partitioning="SPECTRAL"), + referenceNorm=0.942816869518335) + + return failCount + +if __name__ == "__main__": + failCount=0; + failCount+=main() + sys.exit(failCount) diff --git a/test/testParAlmond.py b/test/testParAlmond.py index d251f65c8..3c41249a6 100755 --- a/test/testParAlmond.py +++ b/test/testParAlmond.py @@ -4,7 +4,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal @@ -105,4 +105,4 @@ def main(): if __name__ == "__main__": failCount=0; failCount+=main() - sys.exit(failCount) \ No newline at end of file + sys.exit(failCount) diff --git a/test/testTimeStepper.py b/test/testTimeStepper.py index b7af95feb..edf20ab4b 100755 --- a/test/testTimeStepper.py +++ b/test/testTimeStepper.py @@ -4,7 +4,7 @@ # #The MIT License (MIT) # -#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal @@ -117,4 +117,4 @@ def main(): if __name__ == "__main__": failCount=0; failCount+=main() - sys.exit(failCount) \ No newline at end of file + sys.exit(failCount)