From 17e523b10adbe3c7028e636c48825aea3208c5c7 Mon Sep 17 00:00:00 2001
From: august-knox <knox10@llnl.gov>
Date: Thu, 14 Nov 2024 09:20:39 -0800
Subject: [PATCH 01/24] including extra directory in makefile

---
 makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/makefile b/makefile
index 81957b7..7cc1314 100644
--- a/makefile
+++ b/makefile
@@ -102,7 +102,8 @@ ifeq ($(REMHOS_DEBUG),YES)
 endif
 
 LIBS = $(strip $(REMHOS_LIBS) $(LDFLAGS))
-CCC  = $(strip $(CXX) $(REMHOS_FLAGS))
+EXTRA_INC_DIR = $(or $(wildcard $(MFEM_DIR)/include/mfem),$(MFEM_DIR))
+CCC  = $(strip $(CXX) $(REMHOS_FLAGS) $(if $(EXTRA_INC_DIR),-I$(EXTRA_INC_DIR)))
 Ccc  = $(strip $(CC) $(CFLAGS) $(GL_OPTS))
 
 SOURCE_FILES = remhos.cpp remhos_tools.cpp remhos_lo.cpp remhos_ho.cpp \

From 105b06dbe634f7a16fb7c3ef630e24da1b2c9e3d Mon Sep 17 00:00:00 2001
From: august-knox <knox10@llnl.gov>
Date: Thu, 14 Nov 2024 15:11:46 -0800
Subject: [PATCH 02/24] initial commit of caliper

---
 makefile   | 15 +++++++++++++--
 remhos.cpp | 50 +++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 62 insertions(+), 3 deletions(-)

diff --git a/makefile b/makefile
index 7cc1314..821c94b 100644
--- a/makefile
+++ b/makefile
@@ -64,6 +64,16 @@ TEST_MK = $(MFEM_DIR)/config/test.mk
 MFEM_DIR1 := $(MFEM_DIR)
 MFEM_DIR2 := $(realpath $(MFEM_DIR))
 
+# Caliper/Adiak flags
+
+CALIPER_DIR = $(spack location --install-dir caliper)
+ADIAK_DIR = $(spack location --install-dir adiak)
+CALIPER_FLAGS = -I${CALIPER_DIR}/include -DUSE_CALIPER
+ADIAK_INCLUDE = -I${ADIAK_DIR}/include 
+ADIAK_LDFLAGS =  -L${ADIAK_DIR}/lib -ladiak
+CALIPER_LDFLAGS =  -L${CALIPER_DIR}/lib64 -lcaliper 
+
+
 # Use the compiler used by MFEM. Get the compiler and the options for compiling
 # and linking from MFEM's config.mk. (Skip this if the target does not require
 # building.)
@@ -73,7 +83,7 @@ ifeq (,$(filter help clean distclean style,$(MAKECMDGOALS)))
 endif
 
 CXX = $(MFEM_CXX)
-CPPFLAGS = $(MFEM_CPPFLAGS)
+CPPFLAGS = $(MFEM_CPPFLAGS) $(CALIPER_FLAGS) $(ADIAK_FLAGS)
 CXXFLAGS = $(MFEM_CXXFLAGS)
 
 # MFEM config does not define C compiler
@@ -81,7 +91,8 @@ CC     = gcc
 CFLAGS = -O3
 
 # Optional link flags
-LDFLAGS =
+LDFLAGS = $(CALIPER_LDFLAGS) $(ADIAK_LDFLAGS)
+
 
 OPTIM_OPTS = -O3
 DEBUG_OPTS = -g -Wall -std=c++11
diff --git a/remhos.cpp b/remhos.cpp
index 977d5d2..81d5ec5 100644
--- a/remhos.cpp
+++ b/remhos.cpp
@@ -39,6 +39,13 @@
 #include "remhos_tools.hpp"
 #include "remhos_sync.hpp"
 
+#ifdef USE_CALIPER
+#include <caliper/cali.h>
+#include <caliper/cali-manager.h>
+#ifdef HAVE_MPI
+#include <caliper/cali-mpi.h>
+#endif
+
 using namespace std;
 using namespace mfem;
 
@@ -145,6 +152,19 @@ int main(int argc, char *argv[])
    mfem::MPI_Session mpi(argc, argv);
    const int myid = mpi.WorldRank();
 
+#ifdef USE_CALIPER
+    setupCaliper();
+
+    cali::ConfigManager calimgr(params.simulationParams.caliperConfig.c_str());
+    if (calimgr.error())
+        std::cerr << "caliper config error: " << calimgr.error_msg() << std::endl;
+    calimgr.start();
+    adiak::init(nullptr);
+    adiak::cmdline();
+    adiak::hostname();
+    
+#endif
+
    const char *mesh_file = "data/periodic-square.mesh";
    int rs_levels = 2;
    int rp_levels = 0;
@@ -368,14 +388,24 @@ int main(int argc, char *argv[])
       v.ProjectCoefficient(vcoeff);
 
       double t = 0.0;
+#ifdef USE_CALIPER
+      CALI_CXX_MARK_LOOP_BEGIN(mainloop, "rem.mainloop");
+#endif
       while (t < t_final)
       {
+#ifdef USE_CALIPER
+         CALI_CXX_MARK_LOOP_ITERATION(mainloop, t);
+#endif
          t += dt;
          // Move the mesh nodes.
          x.Add(std::min(dt, t_final-t), v);
          // Update the node velocities.
          v.ProjectCoefficient(vcoeff);
       }
+#ifdef USE_CALIPER
+      CALI_CXX_MARK_LOOP_END(mainloop);
+#endif
+
 
       // Pseudotime velocity.
       add(x, -1.0, x0, v_gf);
@@ -1246,7 +1276,9 @@ int main(int argc, char *argv[])
       delete lom.SubFes1;
       delete lom.VolumeTerms;
    }
-
+#ifdef USE_CALIPER
+   calimgr.flush();
+#endif
    return 0;
 }
 
@@ -1925,3 +1957,19 @@ double inflow_function(const Vector &x)
    }
    else { return 0.0; }
 }
+
+void setupCaliper()
+{
+#ifdef USE_CALIPER
+#ifdef HAVE_MPI
+   cali_mpi_init();
+#endif
+
+   cali_config_preset("CALI_LOG_VERBOSITY", "0");
+   cali_config_preset("CALI_CALIPER_ATTRIBUTE_DEFAULT_SCOPE", "process");
+
+   cali_set_global_string_byname("rem.git_vers", GIT_VERS);
+   cali_set_global_string_byname("rem.git_hash", GIT_HASH);
+#endif
+}
+

From 35f46f23498e54241a7fc464e3346a96ac84eace Mon Sep 17 00:00:00 2001
From: august-knox <knox10@llnl.gov>
Date: Thu, 14 Nov 2024 16:06:38 -0800
Subject: [PATCH 03/24] adding adiak import

---
 remhos.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/remhos.cpp b/remhos.cpp
index 81d5ec5..135a975 100644
--- a/remhos.cpp
+++ b/remhos.cpp
@@ -42,9 +42,11 @@
 #ifdef USE_CALIPER
 #include <caliper/cali.h>
 #include <caliper/cali-manager.h>
+#include <adiak.hpp>
 #ifdef HAVE_MPI
 #include <caliper/cali-mpi.h>
 #endif
+#endif
 
 using namespace std;
 using namespace mfem;

From b3672a817ac19c5c290cc20f04e1ea0623a6905a Mon Sep 17 00:00:00 2001
From: august-knox <knox10@llnl.gov>
Date: Thu, 14 Nov 2024 16:18:23 -0800
Subject: [PATCH 04/24] fix?

---
 remhos.cpp | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/remhos.cpp b/remhos.cpp
index 135a975..d3b0720 100644
--- a/remhos.cpp
+++ b/remhos.cpp
@@ -155,15 +155,15 @@ int main(int argc, char *argv[])
    const int myid = mpi.WorldRank();
 
 #ifdef USE_CALIPER
-    setupCaliper();
-
-    cali::ConfigManager calimgr(params.simulationParams.caliperConfig.c_str());
-    if (calimgr.error())
-        std::cerr << "caliper config error: " << calimgr.error_msg() << std::endl;
-    calimgr.start();
-    adiak::init(nullptr);
-    adiak::cmdline();
-    adiak::hostname();
+   setupCaliper();
+
+   cali::ConfigManager calimgr(params.simulationParams.caliperConfig.c_str());
+   if (calimgr.error())
+       std::cerr << "caliper config error: " << calimgr.error_msg() << std::endl;
+   calimgr.start();
+   adiak::init(nullptr);
+   adiak::cmdline();
+   adiak::hostname();
     
 #endif
 
@@ -1970,8 +1970,8 @@ void setupCaliper()
    cali_config_preset("CALI_LOG_VERBOSITY", "0");
    cali_config_preset("CALI_CALIPER_ATTRIBUTE_DEFAULT_SCOPE", "process");
 
-   cali_set_global_string_byname("rem.git_vers", GIT_VERS);
-   cali_set_global_string_byname("rem.git_hash", GIT_HASH);
+   //cali_set_global_string_byname("rem.git_vers", GIT_VERS);
+   //cali_set_global_string_byname("rem.git_hash", GIT_HASH);
 #endif
 }
 

From 2a1494e1ad14e8d354e64862aea1aa0d7b6f930b Mon Sep 17 00:00:00 2001
From: august-knox <knox10@llnl.gov>
Date: Thu, 14 Nov 2024 16:24:06 -0800
Subject: [PATCH 05/24] adding instantiation

---
 remhos.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/remhos.cpp b/remhos.cpp
index d3b0720..92961ce 100644
--- a/remhos.cpp
+++ b/remhos.cpp
@@ -81,6 +81,9 @@ double inflow_function(const Vector &x);
 // Mesh bounding box
 Vector bb_min, bb_max;
 
+//Caliper setup
+void setupCaliper();
+
 class AdvectionOperator : public TimeDependentOperator
 {
 private:

From a8b8bf9e3d3e0df8f01c98797575ccdc0d3c46a7 Mon Sep 17 00:00:00 2001
From: august-knox <knox10@llnl.gov>
Date: Thu, 14 Nov 2024 16:33:57 -0800
Subject: [PATCH 06/24] moving configmanager

---
 remhos.cpp | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/remhos.cpp b/remhos.cpp
index 92961ce..5e31aca 100644
--- a/remhos.cpp
+++ b/remhos.cpp
@@ -157,19 +157,6 @@ int main(int argc, char *argv[])
    mfem::MPI_Session mpi(argc, argv);
    const int myid = mpi.WorldRank();
 
-#ifdef USE_CALIPER
-   setupCaliper();
-
-   cali::ConfigManager calimgr(params.simulationParams.caliperConfig.c_str());
-   if (calimgr.error())
-       std::cerr << "caliper config error: " << calimgr.error_msg() << std::endl;
-   calimgr.start();
-   adiak::init(nullptr);
-   adiak::cmdline();
-   adiak::hostname();
-    
-#endif
-
    const char *mesh_file = "data/periodic-square.mesh";
    int rs_levels = 2;
    int rp_levels = 0;
@@ -281,6 +268,20 @@ int main(int argc, char *argv[])
    }
    if (myid == 0) { args.PrintOptions(cout); }
 
+//setup caliper config manager
+#ifdef USE_CALIPER
+   setupCaliper();
+
+   cali::ConfigManager calimgr(params.Parse().caliperConfig.c_str());
+   if (calimgr.error())
+       std::cerr << "caliper config error: " << calimgr.error_msg() << std::endl;
+   calimgr.start();
+   adiak::init(nullptr);
+   adiak::cmdline();
+   adiak::hostname();
+    
+#endif
+
    // Enable hardware devices such as GPUs, and programming models such as
    // CUDA, OCCA, RAJA and OpenMP based on command line options.
    Device device(device_config);

From 9b31e0c6a10aaa82699595125a94ca51c73bd50f Mon Sep 17 00:00:00 2001
From: august-knox <knox10@llnl.gov>
Date: Thu, 14 Nov 2024 16:38:48 -0800
Subject: [PATCH 07/24] fixing calimanager

---
 remhos.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/remhos.cpp b/remhos.cpp
index 5e31aca..6c5eb89 100644
--- a/remhos.cpp
+++ b/remhos.cpp
@@ -272,7 +272,7 @@ int main(int argc, char *argv[])
 #ifdef USE_CALIPER
    setupCaliper();
 
-   cali::ConfigManager calimgr(params.Parse().caliperConfig.c_str());
+   cali::ConfigManager calimgr;
    if (calimgr.error())
        std::cerr << "caliper config error: " << calimgr.error_msg() << std::endl;
    calimgr.start();

From 38ac1acab0bdc610925362803c9ad43a24b9777b Mon Sep 17 00:00:00 2001
From: august-knox <knox10@llnl.gov>
Date: Thu, 19 Dec 2024 11:56:35 -0800
Subject: [PATCH 08/24] rermoving unnecessary adiak import

---
 remhos.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/remhos.cpp b/remhos.cpp
index c54314b..d3b4198 100644
--- a/remhos.cpp
+++ b/remhos.cpp
@@ -42,7 +42,7 @@
 #ifdef USE_CALIPER
 #include <caliper/cali.h>
 #include <caliper/cali-manager.h>
-#include <adiak.hpp>
+//#include <adiak.hpp>
 #ifdef HAVE_MPI
 #include <caliper/cali-mpi.h>
 #endif

From e27e71f6e746acae87198d6159c93a99f3a66160 Mon Sep 17 00:00:00 2001
From: august-knox <knox10@llnl.gov>
Date: Thu, 19 Dec 2024 12:11:56 -0800
Subject: [PATCH 09/24] rmoving problematic adiak line

---
 makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/makefile b/makefile
index 821c94b..1c81b68 100644
--- a/makefile
+++ b/makefile
@@ -67,7 +67,7 @@ MFEM_DIR2 := $(realpath $(MFEM_DIR))
 # Caliper/Adiak flags
 
 CALIPER_DIR = $(spack location --install-dir caliper)
-ADIAK_DIR = $(spack location --install-dir adiak)
+#ADIAK_DIR = $(spack location --install-dir adiak)
 CALIPER_FLAGS = -I${CALIPER_DIR}/include -DUSE_CALIPER
 ADIAK_INCLUDE = -I${ADIAK_DIR}/include 
 ADIAK_LDFLAGS =  -L${ADIAK_DIR}/lib -ladiak

From 1ba87a0921e943d6cfc9354472e937457d771b26 Mon Sep 17 00:00:00 2001
From: august-knox <knox10@llnl.gov>
Date: Thu, 19 Dec 2024 12:31:44 -0800
Subject: [PATCH 10/24] removing adiak annotations

---
 remhos.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/remhos.cpp b/remhos.cpp
index d3b4198..f00e841 100644
--- a/remhos.cpp
+++ b/remhos.cpp
@@ -42,7 +42,7 @@
 #ifdef USE_CALIPER
 #include <caliper/cali.h>
 #include <caliper/cali-manager.h>
-//#include <adiak.hpp>
+#include <adiak.hpp>
 #ifdef HAVE_MPI
 #include <caliper/cali-mpi.h>
 #endif
@@ -297,9 +297,9 @@ MFEM_EXPORT int remhos(int argc, char *argv[], double &final_mass_u)
    if (calimgr.error())
        std::cerr << "caliper config error: " << calimgr.error_msg() << std::endl;
    calimgr.start();
-   adiak::init(nullptr);
-   adiak::cmdline();
-   adiak::hostname();
+   //adiak::init(nullptr);
+   //adiak::cmdline();
+   //adiak::hostname();
     
 #endif
 

From 91b1cb5818cd8d4ccb166b0b0f23a73b0c295588 Mon Sep 17 00:00:00 2001
From: august-knox <knox10@llnl.gov>
Date: Thu, 19 Dec 2024 12:35:24 -0800
Subject: [PATCH 11/24] removing other import

---
 remhos.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/remhos.cpp b/remhos.cpp
index f00e841..d1f239a 100644
--- a/remhos.cpp
+++ b/remhos.cpp
@@ -42,7 +42,7 @@
 #ifdef USE_CALIPER
 #include <caliper/cali.h>
 #include <caliper/cali-manager.h>
-#include <adiak.hpp>
+//#include <adiak.hpp>
 #ifdef HAVE_MPI
 #include <caliper/cali-mpi.h>
 #endif

From 41d5217deed46539bbd82b9ee3f1f75436a45d3f Mon Sep 17 00:00:00 2001
From: august-knox <knox10@llnl.gov>
Date: Thu, 19 Dec 2024 14:43:56 -0800
Subject: [PATCH 12/24] updating remhos_tools

---
 remhos_tools.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/remhos_tools.hpp b/remhos_tools.hpp
index 6b29897..6bb2f63 100644
--- a/remhos_tools.hpp
+++ b/remhos_tools.hpp
@@ -90,7 +90,7 @@ class SmoothnessIndicator
    ~SmoothnessIndicator();
 
    void ComputeSmoothnessIndicator(const Vector &u, ParGridFunction &si_vals_u);
-   void UpdateBounds(int dof_id, double u_HO,
+   __host__ __device__ void UpdateBounds(int dof_id, double u_HO,
                      const ParGridFunction &si_vals,
                      double &u_min, double &u_max);
 

From 6b571d36f5af49654eb2d0c5ba72cf02788f73bb Mon Sep 17 00:00:00 2001
From: august-knox <knox10@llnl.gov>
Date: Thu, 19 Dec 2024 14:46:55 -0800
Subject: [PATCH 13/24] updating class as well

---
 remhos_tools.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/remhos_tools.cpp b/remhos_tools.cpp
index 4422eb2..77db06b 100644
--- a/remhos_tools.cpp
+++ b/remhos_tools.cpp
@@ -180,7 +180,7 @@ void SmoothnessIndicator::ComputeSmoothnessIndicator(const Vector &u,
    }
 }
 
-void SmoothnessIndicator::UpdateBounds(int dof_id, double u_HO,
+__host__ __device__ void SmoothnessIndicator::UpdateBounds(int dof_id, double u_HO,
                                        const ParGridFunction &si_vals,
                                        double &u_min, double &u_max)
 {

From 93085fa350383bf5ff73553f077c6ec8f11b4a71 Mon Sep 17 00:00:00 2001
From: august-knox <knox10@llnl.gov>
Date: Thu, 19 Dec 2024 14:52:04 -0800
Subject: [PATCH 14/24] undoing changes

---
 remhos_tools.cpp | 2 +-
 remhos_tools.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/remhos_tools.cpp b/remhos_tools.cpp
index 77db06b..4422eb2 100644
--- a/remhos_tools.cpp
+++ b/remhos_tools.cpp
@@ -180,7 +180,7 @@ void SmoothnessIndicator::ComputeSmoothnessIndicator(const Vector &u,
    }
 }
 
-__host__ __device__ void SmoothnessIndicator::UpdateBounds(int dof_id, double u_HO,
+void SmoothnessIndicator::UpdateBounds(int dof_id, double u_HO,
                                        const ParGridFunction &si_vals,
                                        double &u_min, double &u_max)
 {
diff --git a/remhos_tools.hpp b/remhos_tools.hpp
index 6bb2f63..6b29897 100644
--- a/remhos_tools.hpp
+++ b/remhos_tools.hpp
@@ -90,7 +90,7 @@ class SmoothnessIndicator
    ~SmoothnessIndicator();
 
    void ComputeSmoothnessIndicator(const Vector &u, ParGridFunction &si_vals_u);
-   __host__ __device__ void UpdateBounds(int dof_id, double u_HO,
+   void UpdateBounds(int dof_id, double u_HO,
                      const ParGridFunction &si_vals,
                      double &u_min, double &u_max);
 

From 0690282ec45dc94af949bc03e1f06b1981b8ebba Mon Sep 17 00:00:00 2001
From: august-knox <knox10@llnl.gov>
Date: Mon, 6 Jan 2025 12:31:28 -0800
Subject: [PATCH 15/24] make caliper optional

---
 makefile | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/makefile b/makefile
index 1c81b68..2d888c7 100644
--- a/makefile
+++ b/makefile
@@ -66,12 +66,14 @@ MFEM_DIR2 := $(realpath $(MFEM_DIR))
 
 # Caliper/Adiak flags
 
-CALIPER_DIR = $(spack location --install-dir caliper)
+#CALIPER_DIR = $(spack location --install-dir caliper)
 #ADIAK_DIR = $(spack location --install-dir adiak)
+ifeq ($(CALIPER_DIR),)
 CALIPER_FLAGS = -I${CALIPER_DIR}/include -DUSE_CALIPER
 ADIAK_INCLUDE = -I${ADIAK_DIR}/include 
 ADIAK_LDFLAGS =  -L${ADIAK_DIR}/lib -ladiak
 CALIPER_LDFLAGS =  -L${CALIPER_DIR}/lib64 -lcaliper 
+endif
 
 
 # Use the compiler used by MFEM. Get the compiler and the options for compiling

From ca08bf8b10c85c2da799b8afc01e2ad84be9aaaf Mon Sep 17 00:00:00 2001
From: august-knox <knox10@llnl.gov>
Date: Mon, 6 Jan 2025 14:48:09 -0800
Subject: [PATCH 16/24] making caliper optional

---
 makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/makefile b/makefile
index 2d888c7..af825f5 100644
--- a/makefile
+++ b/makefile
@@ -68,7 +68,7 @@ MFEM_DIR2 := $(realpath $(MFEM_DIR))
 
 #CALIPER_DIR = $(spack location --install-dir caliper)
 #ADIAK_DIR = $(spack location --install-dir adiak)
-ifeq ($(CALIPER_DIR),)
+ifdef CALIPER_DIR
 CALIPER_FLAGS = -I${CALIPER_DIR}/include -DUSE_CALIPER
 ADIAK_INCLUDE = -I${ADIAK_DIR}/include 
 ADIAK_LDFLAGS =  -L${ADIAK_DIR}/lib -ladiak

From ad75451c0c190f556314a09456818f3b3f155f63 Mon Sep 17 00:00:00 2001
From: august-knox <knox10@llnl.gov>
Date: Mon, 6 Jan 2025 15:52:08 -0800
Subject: [PATCH 17/24] fixing conflict

---
 remhos.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/remhos.cpp b/remhos.cpp
index 75518f7..de1e514 100644
--- a/remhos.cpp
+++ b/remhos.cpp
@@ -291,7 +291,6 @@ MFEM_EXPORT int remhos(int argc, char *argv[], double &final_mass_u)
    }
    if (myid == 0) { args.PrintOptions(cout); }
 
-<<<<<<< HEAD
 //setup caliper config manager
 #ifdef USE_CALIPER
    setupCaliper();
@@ -303,13 +302,12 @@ MFEM_EXPORT int remhos(int argc, char *argv[], double &final_mass_u)
    //adiak::init(nullptr);
    //adiak::cmdline();
    //adiak::hostname();
+#endif
     
-=======
 #ifdef REMHOS_GPU_SETUP
    MFEM_VERIFY(ho_type  == HOSolverType::LocalInverse &&
                lo_type  == LOSolverType::MassBased &&
                fct_type == FCTSolverType::ClipScale, "Wrong GPU setup.");
->>>>>>> f102edb64e5f4ddaf8898da28e9e275fe57cdf5d
 #endif
 
    // Enable hardware devices such as GPUs, and programming models such as

From 63c6bcaf6eac42a06aae53bf31aad7d76c967d2b Mon Sep 17 00:00:00 2001
From: august-knox <knox10@llnl.gov>
Date: Mon, 6 Jan 2025 15:56:02 -0800
Subject: [PATCH 18/24] adding back remhos_fct

---
 remhos_fct.cpp | 1012 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1012 insertions(+)
 create mode 100644 remhos_fct.cpp

diff --git a/remhos_fct.cpp b/remhos_fct.cpp
new file mode 100644
index 0000000..be2454b
--- /dev/null
+++ b/remhos_fct.cpp
@@ -0,0 +1,1012 @@
+// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at
+// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights
+// reserved. See files LICENSE and NOTICE for details.
+//
+// This file is part of CEED, a collection of benchmarks, miniapps, software
+// libraries and APIs for efficient high-order finite element and spectral
+// element discretizations for exascale applications. For more information and
+// source code availability see http://github.com/ceed.
+//
+// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
+// a collaborative effort of two U.S. Department of Energy organizations (Office
+// of Science and the National Nuclear Security Administration) responsible for
+// the planning and preparation of a capable exascale ecosystem, including
+// software, applications, hardware, advanced system engineering and early
+// testbed platforms, in support of the nation's exascale computing imperative.
+
+#include "remhos_fct.hpp"
+#include "remhos_tools.hpp"
+#include "remhos_sync.hpp"
+
+using namespace std;
+
+namespace mfem
+{
+
+void FCTSolver::CalcCompatibleLOProduct(const ParGridFunction &us,
+                                        const Vector &m, const Vector &d_us_HO,
+                                        Vector &s_min, Vector &s_max,
+                                        const Vector &u_new,
+                                        const Array<bool> &active_el,
+                                        const Array<bool> &active_dofs,
+                                        Vector &d_us_LO_new)
+{
+   const double eps = 1e-12;
+   int dof_id;
+
+   // Compute a compatible low-order solution.
+   const int NE = us.ParFESpace()->GetNE();
+   const int ndofs = us.Size() / NE;
+
+   Vector s_min_loc, s_max_loc;
+
+   d_us_LO_new = 0.0;
+
+   for (int k = 0; k < NE; k++)
+   {
+      if (active_el[k] == false) { continue; }
+
+      double mass_us = 0.0, mass_u = 0.0;
+      for (int j = 0; j < ndofs; j++)
+      {
+         const double us_new_HO = us(k*ndofs + j) + dt * d_us_HO(k*ndofs + j);
+         mass_us += us_new_HO * m(k*ndofs + j);
+         mass_u  += u_new(k*ndofs + j) * m(k*ndofs + j);
+      }
+      double s_avg = mass_us / mass_u;
+
+      // Min and max of s using the full stencil of active dofs.
+      s_min_loc.SetDataAndSize(s_min.GetData() + k*ndofs, ndofs);
+      s_max_loc.SetDataAndSize(s_max.GetData() + k*ndofs, ndofs);
+      double smin = numeric_limits<double>::infinity(),
+             smax = -numeric_limits<double>::infinity();
+      for (int j = 0; j < ndofs; j++)
+      {
+         if (active_dofs[k*ndofs + j] == false) { continue; }
+         smin = min(smin, s_min_loc(j));
+         smax = max(smax, s_max_loc(j));
+      }
+
+      // Fix inconsistencies due to round-off and the usage of local bounds.
+      for (int j = 0; j < ndofs; j++)
+      {
+         if (active_dofs[k*ndofs + j] == false) { continue; }
+
+         // Check if there's a violation, s_avg < s_min, due to round-offs that
+         // are inflated by the division of a small number (the 2nd check means
+         // s_avg = mass_us / mass_u > s_min up to round-off in mass_us).
+         if (s_avg < smin &&
+             mass_us + eps > smin * mass_u) { s_avg = smin; }
+         // As above for the s_max.
+         if (s_avg > smax &&
+             mass_us - eps < smax * mass_u) { s_avg = smax; }
+
+#ifdef REMHOS_FCT_PRODUCT_DEBUG
+         // Check if s_avg = mass_us / mass_u is within the bounds of the full
+         // stencil of active dofs.
+         if (mass_us + eps < smin * mass_u ||
+             mass_us - eps > smax * mass_u ||
+             s_avg + eps < smin ||
+             s_avg - eps > smax)
+         {
+            std::cout << "---\ns_avg element bounds: "
+                      << smin << " " << s_avg << " " << smax << std::endl;
+            std::cout << "Element " << k << std::endl;
+            std::cout << "Masses " << mass_us << " " << mass_u << std::endl;
+            PrintCellValues(k, NE, u_new, "u_loc: ");
+
+            MFEM_ABORT("s_avg is not in the full stencil bounds!");
+         }
+#endif
+
+         // When s_avg is not in the local bounds for some dof (it should be
+         // within the full stencil of active dofs), reset the bounds to s_avg.
+         if (s_avg + eps < s_min_loc(j)) { s_min_loc(j) = s_avg; }
+         if (s_avg - eps > s_max_loc(j)) { s_max_loc(j) = s_avg; }
+      }
+
+      // Take into account the compatible low-order solution.
+      for (int j = 0; j < ndofs; j++)
+      {
+         // In inactive dofs we get u_new*s_avg ~ 0, which should be fine.
+
+         // Compatible LO solution.
+         dof_id = k*ndofs + j;
+         d_us_LO_new(dof_id) = (u_new(dof_id) * s_avg - us(dof_id)) / dt;
+      }
+
+#ifdef REMHOS_FCT_PRODUCT_DEBUG
+      // Check the LO product solution.
+      double us_min, us_max;
+      for (int j = 0; j < ndofs; j++)
+      {
+         dof_id = k*ndofs + j;
+         if (active_dofs[dof_id] == false) { continue; }
+
+         us_min = s_min_loc(j) * u_new(dof_id);
+         us_max = s_max_loc(j) * u_new(dof_id);
+
+         if (s_avg * u_new(dof_id) + eps < us_min ||
+             s_avg * u_new(dof_id) - eps > us_max)
+         {
+            std::cout << "---\ns_avg * u: " << k << " "
+                      << us_min << " "
+                      << s_avg * u_new(dof_id) << " "
+                      << us_max << endl
+                      << u_new(dof_id) << " " << s_avg << endl
+                      << s_min_loc(j) << " " << s_max_loc(j) << "\n---\n";
+
+            MFEM_ABORT("s_avg * u not in bounds");
+         }
+      }
+#endif
+   }
+}
+
+void FCTSolver::ScaleProductBounds(const Vector &s_min, const Vector &s_max,
+                                   const Vector &u_new,
+                                   const Array<bool> &active_el,
+                                   const Array<bool> &active_dofs,
+                                   Vector &us_min, Vector &us_max)
+{
+   const int NE = pfes.GetNE();
+   const int ndofs = u_new.Size() / NE;
+   int dof_id;
+   us_min = 0.0;
+   us_max = 0.0;
+   for (int k = 0; k < NE; k++)
+   {
+      if (active_el[k] == false) { continue; }
+
+      // Rescale the bounds (s_min, s_max) -> (u*s_min, u*s_max).
+      for (int j = 0; j < ndofs; j++)
+      {
+         dof_id = k*ndofs + j;
+
+         // For inactive dofs, s_min and s_max are undefined (inf values).
+         if (active_dofs[dof_id] == false)
+         {
+            us_min(dof_id) = 0.0;
+            us_max(dof_id) = 0.0;
+            continue;
+         }
+
+         us_min(dof_id) = s_min(dof_id) * u_new(dof_id);
+         us_max(dof_id) = s_max(dof_id) * u_new(dof_id);
+      }
+   }
+}
+
+void FluxBasedFCT::CalcFCTSolution(const ParGridFunction &u, const Vector &m,
+                                   const Vector &du_ho, const Vector &du_lo,
+                                   const Vector &u_min, const Vector &u_max,
+                                   Vector &du) const
+{
+   MFEM_VERIFY(smth_indicator == NULL, "TODO: update SI bounds.");
+
+   // Construct the flux matrix (it gets recomputed every time).
+   ComputeFluxMatrix(u, du_ho, flux_ij);
+
+   // Iterated FCT correction.
+   Vector du_lo_fct(du_lo);
+   for (int fct_iter = 0; fct_iter < iter_cnt; fct_iter++)
+   {
+      // Compute sums of incoming/outgoing fluxes at each DOF.
+      AddFluxesAtDofs(flux_ij, gp, gm);
+
+      // Compute the flux coefficients (aka alphas) into gp and gm.
+      ComputeFluxCoefficients(u, du_lo_fct, m, u_min, u_max, gp, gm);
+
+      // Apply the alpha coefficients to get the final solution.
+      // Update the flux matrix for iterative FCT (when iter_cnt > 1).
+      UpdateSolutionAndFlux(du_lo_fct, m, gp, gm, flux_ij, du);
+
+      du_lo_fct = du;
+   }
+}
+
+void FluxBasedFCT::CalcFCTProduct(const ParGridFunction &us, const Vector &m,
+                                  const Vector &d_us_HO, const Vector &d_us_LO,
+                                  Vector &s_min, Vector &s_max,
+                                  const Vector &u_new,
+                                  const Array<bool> &active_el,
+                                  const Array<bool> &active_dofs, Vector &d_us)
+{
+   // Construct the flux matrix (it gets recomputed every time).
+   ComputeFluxMatrix(us, d_us_HO, flux_ij);
+
+   us.HostRead();
+   d_us_LO.HostRead();
+   s_min.HostReadWrite();
+   s_max.HostReadWrite();
+   u_new.HostRead();
+   active_el.HostRead();
+   active_dofs.HostRead();
+
+   // Compute a compatible low-order solution.
+   Vector dus_lo_fct(us.Size()), us_min(us.Size()), us_max(us.Size());
+   CalcCompatibleLOProduct(us, m, d_us_HO, s_min, s_max, u_new,
+                           active_el, active_dofs, dus_lo_fct);
+   ScaleProductBounds(s_min, s_max, u_new, active_el, active_dofs,
+                      us_min, us_max);
+
+   // Update the flux matrix to a product-compatible version.
+   // Compute a compatible low-order solution.
+   const int NE = us.ParFESpace()->GetNE();
+   const int ndofs = us.Size() / NE;
+   Vector flux_el(ndofs), beta(ndofs);
+   DenseMatrix fij_el(ndofs);
+   fij_el = 0.0;
+   Array<int> dofs;
+   int dof_id;
+   for (int k = 0; k < NE; k++)
+   {
+      if (active_el[k] == false) { continue; }
+
+      // Take into account the compatible low-order solution.
+      for (int j = 0; j < ndofs; j++)
+      {
+         // In inactive dofs we get u_new*s_avg ~ 0, which should be fine.
+
+         dof_id = k*ndofs + j;
+         flux_el(j) = m(dof_id) * dt * (d_us_LO(dof_id) - dus_lo_fct(dof_id));
+         beta(j) = m(dof_id) * u_new(dof_id);
+      }
+
+      // Make the betas sum to 1, add the new compatible fluxes.
+      beta /= beta.Sum();
+      for (int j = 1; j < ndofs; j++)
+      {
+         for (int i = 0; i < j; i++)
+         {
+            fij_el(i, j) = beta(j) * flux_el(i) - beta(i) * flux_el(j);
+         }
+      }
+      pfes.GetElementDofs(k, dofs);
+      flux_ij.AddSubMatrix(dofs, dofs, fij_el);
+   }
+
+   // Iterated FCT correction.
+   // To get the LO compatible product solution (with s_avg), just do
+   // d_us = dus_lo_fct instead of the loop below.
+   for (int fct_iter = 0; fct_iter < iter_cnt; fct_iter++)
+   {
+      // Compute sums of incoming fluxes at each DOF.
+      AddFluxesAtDofs(flux_ij, gp, gm);
+
+      // Compute the flux coefficients (aka alphas) into gp and gm.
+      ComputeFluxCoefficients(us, dus_lo_fct, m, us_min, us_max, gp, gm);
+
+      // Apply the alpha coefficients to get the final solution.
+      // Update the fluxes for iterative FCT (when iter_cnt > 1).
+      UpdateSolutionAndFlux(dus_lo_fct, m, gp, gm, flux_ij, d_us);
+
+      ZeroOutEmptyDofs(active_el, active_dofs, d_us);
+
+      dus_lo_fct = d_us;
+   }
+
+#ifdef REMHOS_FCT_PRODUCT_DEBUG
+   // Check the bounds of the final solution.
+   const double eps = 1e-12;
+   Vector us_new(d_us.Size());
+   add(1.0, us, dt, d_us, us_new);
+   for (int k = 0; k < NE; k++)
+   {
+      if (active_el[k] == false) { continue; }
+
+      for (int j = 0; j < ndofs; j++)
+      {
+         dof_id = k*ndofs + j;
+         if (active_dofs[dof_id] == false) { continue; }
+
+         double s = us_new(dof_id) / u_new(dof_id);
+         if (s + eps < s_min(dof_id) ||
+             s - eps > s_max(dof_id))
+         {
+            std::cout << "Final s " << j << " " << k << " "
+                      << s_min(dof_id) << " "
+                      << s << " "
+                      << s_max(dof_id) << std::endl;
+            std::cout << "---\n";
+         }
+
+         if (us_new(dof_id) + eps < us_min(dof_id) ||
+             us_new(dof_id) - eps > us_max(dof_id))
+         {
+            std::cout << "Final us " << j << " " << k << " "
+                      << us_min(dof_id) << " "
+                      << us_new(dof_id) << " "
+                      << us_max(dof_id) << std::endl;
+            std::cout << "---\n";
+         }
+      }
+   }
+#endif
+}
+
+void FluxBasedFCT::ComputeFluxMatrix(const ParGridFunction &u,
+                                     const Vector &du_ho,
+                                     SparseMatrix &flux_mat) const
+{
+   const int s = u.Size();
+   double *flux_data = flux_mat.HostReadWriteData();
+   flux_mat.HostReadI(); flux_mat.HostReadJ();
+   const int *K_I = K.HostReadI(), *K_J = K.HostReadJ();
+   const double *K_data = K.HostReadData();
+   const double *u_np = u.FaceNbrData().HostRead();
+   u.HostRead();
+   du_ho.HostRead();
+   for (int i = 0; i < s; i++)
+   {
+      for (int k = K_I[i]; k < K_I[i + 1]; k++)
+      {
+         int j = K_J[k];
+         if (j <= i) { continue; }
+
+         double kij  = K_data[k], kji = K_data[K_smap[k]];
+         double dij  = max(max(0.0, -kij), -kji);
+         double u_ij = (j < s) ? u(i) - u(j)
+                       : u(i) - u_np[j - s];
+
+         flux_data[k] = dt * dij * u_ij;
+      }
+   }
+
+   const int NE = pfes.GetMesh()->GetNE();
+   const int ndof = s / NE;
+   Array<int> dofs;
+   DenseMatrix Mz(ndof);
+   Vector du_z(ndof);
+   for (int k = 0; k < NE; k++)
+   {
+      pfes.GetElementDofs(k, dofs);
+      M.GetSubMatrix(dofs, dofs, Mz);
+      du_ho.GetSubVector(dofs, du_z);
+      for (int i = 0; i < ndof; i++)
+      {
+         int j = 0;
+         for (; j <= i; j++) { Mz(i, j) = 0.0; }
+         for (; j < ndof; j++) { Mz(i, j) *= dt * (du_z(i) - du_z(j)); }
+      }
+      flux_mat.AddSubMatrix(dofs, dofs, Mz, 0);
+   }
+}
+
+// Compute sums of incoming fluxes for every DOF.
+void FluxBasedFCT::AddFluxesAtDofs(const SparseMatrix &flux_mat,
+                                   Vector &flux_pos, Vector &flux_neg) const
+{
+   const int s = flux_pos.Size();
+   const double *flux_data = flux_mat.GetData();
+   const int *flux_I = flux_mat.GetI(), *flux_J = flux_mat.GetJ();
+   flux_pos = 0.0;
+   flux_neg = 0.0;
+   flux_pos.HostReadWrite();
+   flux_neg.HostReadWrite();
+   for (int i = 0; i < s; i++)
+   {
+      for (int k = flux_I[i]; k < flux_I[i + 1]; k++)
+      {
+         int j = flux_J[k];
+
+         // The skipped fluxes will be added when the outer loop is at j as
+         // the flux matrix is always symmetric.
+         if (j <= i) { continue; }
+
+         const double f_ij = flux_data[k];
+
+         if (f_ij >= 0.0)
+         {
+            flux_pos(i) += f_ij;
+            // Modify j if it's on the same MPI task (prevents x2 counting).
+            if (j < s) { flux_neg(j) -= f_ij; }
+         }
+         else
+         {
+            flux_neg(i) += f_ij;
+            // Modify j if it's on the same MPI task (prevents x2 counting).
+            if (j < s) { flux_pos(j) -= f_ij; }
+         }
+      }
+   }
+}
+
+// Compute the so-called alpha coefficients that scale the fluxes into gp, gm.
+void FluxBasedFCT::
+ComputeFluxCoefficients(const Vector &u, const Vector &du_lo, const Vector &m,
+                        const Vector &u_min, const Vector &u_max,
+                        Vector &coeff_pos, Vector &coeff_neg) const
+{
+   const int s = u.Size();
+   for (int i = 0; i < s; i++)
+   {
+      const double u_lo = u(i) + dt * du_lo(i);
+      const double max_pos_diff = max((u_max(i) - u_lo) * m(i), 0.0),
+                   min_neg_diff = min((u_min(i) - u_lo) * m(i), 0.0);
+      const double sum_pos = coeff_pos(i), sum_neg = coeff_neg(i);
+
+      coeff_pos(i) = (sum_pos > max_pos_diff) ? max_pos_diff / sum_pos : 1.0;
+      coeff_neg(i) = (sum_neg < min_neg_diff) ? min_neg_diff / sum_neg : 1.0;
+   }
+}
+
+void FluxBasedFCT::
+UpdateSolutionAndFlux(const Vector &du_lo, const Vector &m,
+                      ParGridFunction &coeff_pos, ParGridFunction &coeff_neg,
+                      SparseMatrix &flux_mat, Vector &du) const
+{
+   Vector &a_pos_n = coeff_pos.FaceNbrData();
+   Vector &a_neg_n = coeff_neg.FaceNbrData();
+   coeff_pos.ExchangeFaceNbrData();
+   coeff_neg.ExchangeFaceNbrData();
+
+   du = du_lo;
+
+   coeff_pos.HostReadWrite();
+   coeff_neg.HostReadWrite();
+   du.HostReadWrite();
+
+   double *flux_data = flux_mat.HostReadWriteData();
+   const int *flux_I = flux_mat.HostReadI(), *flux_J = flux_mat.HostReadJ();
+   const int s = du.Size();
+   for (int i = 0; i < s; i++)
+   {
+      for (int k = flux_I[i]; k < flux_I[i + 1]; k++)
+      {
+         int j = flux_J[k];
+         if (j <= i) { continue; }
+
+         double fij = flux_data[k], a_ij;
+         if (fij >= 0.0)
+         {
+            a_ij = (j < s) ? min(coeff_pos(i), coeff_neg(j))
+                   : min(coeff_pos(i), a_neg_n(j - s));
+         }
+         else
+         {
+            a_ij = (j < s) ? min(coeff_neg(i), coeff_pos(j))
+                   : min(coeff_neg(i), a_pos_n(j - s));
+         }
+         fij *= a_ij;
+
+         du(i) += fij / m(i) / dt;
+         if (j < s) { du(j) -= fij / m(j) / dt; }
+
+         flux_data[k] -= fij;
+      }
+   }
+}
+
+
+void ClipScaleSolver::CalcFCTSolution(const ParGridFunction &u, const Vector &m,
+                                      const Vector &du_ho, const Vector &du_lo,
+                                      const Vector &u_min, const Vector &u_max,
+                                      Vector &du) const
+{
+   const int NE = pfes.GetMesh()->GetNE();
+   const int nd = pfes.GetFE(0)->GetDof();
+   Vector f_clip(nd);
+
+   int dof_id;
+   double sumPos, sumNeg, u_new_ho, u_new_lo, new_mass, f_clip_min, f_clip_max;
+   double umin, umax;
+   const double eps = 1.0e-15;
+
+   // Smoothness indicator.
+   ParGridFunction si_val;
+   if (smth_indicator)
+   {
+      smth_indicator->ComputeSmoothnessIndicator(u, si_val);
+   }
+
+   u.HostRead();
+   m.HostRead();
+   du.HostReadWrite();
+   du_lo.HostRead(); du_ho.HostRead();
+   for (int k = 0; k < NE; k++)
+   {
+      sumPos = sumNeg = 0.0;
+
+      // Clip.
+      for (int j = 0; j < nd; j++)
+      {
+         dof_id = k*nd+j;
+
+         u_new_ho   = u(dof_id) + dt * du_ho(dof_id);
+         u_new_lo   = u(dof_id) + dt * du_lo(dof_id);
+
+         umin = u_min(dof_id);
+         umax = u_max(dof_id);
+         if (smth_indicator)
+         {
+            smth_indicator->UpdateBounds(dof_id, u_new_ho, si_val, umin, umax);
+         }
+
+         f_clip_min = m(dof_id) / dt * (umin - u_new_lo);
+         f_clip_max = m(dof_id) / dt * (umax - u_new_lo);
+
+         f_clip(j) = m(dof_id) * (du_ho(dof_id) - du_lo(dof_id));
+         f_clip(j) = min(f_clip_max, max(f_clip_min, f_clip(j)));
+
+         sumNeg += min(f_clip(j), 0.0);
+         sumPos += max(f_clip(j), 0.0);
+      }
+
+      new_mass = sumNeg + sumPos;
+
+      // Rescale.
+      for (int j = 0; j < nd; j++)
+      {
+         if (new_mass > eps)
+         {
+            f_clip(j) = min(0.0, f_clip(j)) -
+                        max(0.0, f_clip(j)) * sumNeg / sumPos;
+         }
+         if (new_mass < -eps)
+         {
+            f_clip(j) = max(0.0, f_clip(j)) -
+                        min(0.0, f_clip(j)) * sumPos / sumNeg;
+         }
+
+         // Set du to the discrete time derivative featuring the high order
+         // anti-diffusive reconstruction that leads to an forward Euler
+         // updated admissible solution.
+         dof_id = k*nd+j;
+         du(dof_id) = du_lo(dof_id) + f_clip(j) / m(dof_id);
+      }
+   }
+}
+
+void ClipScaleSolver::CalcFCTProduct(const ParGridFunction &us, const Vector &m,
+                                     const Vector &d_us_HO, const Vector &d_us_LO,
+                                     Vector &s_min, Vector &s_max,
+                                     const Vector &u_new,
+                                     const Array<bool> &active_el,
+                                     const Array<bool> &active_dofs, Vector &d_us)
+{
+   us.HostRead();
+   s_min.HostReadWrite();
+   s_max.HostReadWrite();
+   u_new.HostRead();
+   active_el.HostRead();
+   active_dofs.HostRead();
+
+   // Compute a compatible low-order solution.
+   Vector dus_lo_fct(us.Size()), us_min(us.Size()), us_max(us.Size());
+   CalcCompatibleLOProduct(us, m, d_us_HO, s_min, s_max, u_new,
+                           active_el, active_dofs, dus_lo_fct);
+   ScaleProductBounds(s_min, s_max, u_new, active_el, active_dofs,
+                      us_min, us_max);
+
+   // ClipScale solve for d_us.
+   CalcFCTSolution(us, m, d_us_HO, dus_lo_fct, us_min, us_max, d_us);
+   ZeroOutEmptyDofs(active_el, active_dofs, d_us);
+
+#ifdef REMHOS_FCT_PRODUCT_DEBUG
+   // Check the bounds of the final solution.
+   const int NE = pfes.GetNE();
+   const int ndofs = u_new.Size() / NE;
+   int dof_id;
+   const double eps = 1e-12;
+   Vector us_new(d_us.Size());
+   add(1.0, us, dt, d_us, us_new);
+   for (int k = 0; k < NE; k++)
+   {
+      if (active_el[k] == false) { continue; }
+
+      for (int j = 0; j < ndofs; j++)
+      {
+         dof_id = k*ndofs + j;
+         if (active_dofs[dof_id] == false) { continue; }
+
+         double s = us_new(dof_id) / u_new(dof_id);
+         if (s + eps < s_min(dof_id) ||
+             s - eps > s_max(dof_id))
+         {
+            std::cout << "Final s " << j << " " << k << " "
+                      << s_min(dof_id) << " "
+                      << s << " "
+                      << s_max(dof_id) << std::endl;
+            std::cout << "---\n";
+         }
+
+         if (us_new(dof_id) + eps < us_min(dof_id) ||
+             us_new(dof_id) - eps > us_max(dof_id))
+         {
+            std::cout << "Final us " << j << " " << k << " "
+                      << us_min(dof_id) << " "
+                      << us_new(dof_id) << " "
+                      << us_max(dof_id) << std::endl;
+            std::cout << "---\n";
+         }
+      }
+   }
+#endif
+}
+
+void ElementFCTProjection::CalcFCTSolution(const ParGridFunction &u,
+                                           const Vector &m,
+                                           const Vector &du_HO,
+                                           const Vector &du_LO,
+                                           const Vector &u_min,
+                                           const Vector &u_max,
+                                           Vector &du) const
+{
+   const int NE = pfes.GetMesh()->GetNE();
+   const int s  = pfes.GetFE(0)->GetDof();
+   int dof_id;
+
+   DenseMatrix M(s);
+   Vector ML(s), rhs(s), beta(s), z(s), u_loc, du_HO_loc, du_LO_loc, du_loc,
+          du_max_loc(s), du_min_loc(s);
+   MassIntegrator mass_integ;
+
+   for (int k = 0; k < NE; k++)
+   {
+      u_loc.SetDataAndSize(u.GetData() + k*s, s);
+      du_HO_loc.SetDataAndSize(du_HO.GetData() + k*s, s);
+      du_LO_loc.SetDataAndSize(du_LO.GetData() + k*s, s);
+      du_loc.SetDataAndSize(du.GetData() + k*s, s);
+
+      // Local max/min increments.
+      for (int i = 0; i < s; i++)
+      {
+         dof_id = k*s + i;
+         du_max_loc(i) = (u_max(dof_id) - u(dof_id)) / dt; // positive
+         du_min_loc(i) = (u_min(dof_id) - u(dof_id)) / dt; // negative
+      }
+
+      // Construct the local mass matrix.
+      ElementTransformation *T = pfes.GetMesh()->GetElementTransformation(k);
+      const FiniteElement *el = pfes.GetFE(k);
+      mass_integ.AssembleElementMatrix(*el, *T, M);
+
+      M.Mult(du_HO_loc, rhs);
+      M.GetRowSums(ML);
+
+      for (int i = 0; i < s; i++)
+      {
+         // Some different options for beta:
+         //beta(i) = 1.0;
+         beta(i) = ML(i);
+         //beta(i) = Mxy(i);
+
+         // The low order flux correction
+         z(i) = rhs(i) - ML(i) * du_LO_loc(i);
+      }
+
+      // Make beta_i sum to 1.
+      beta /= beta.Sum();
+
+      DenseMatrix F(s);
+      for (int i = 1; i < s; i++)
+      {
+         for (int j = 0; j < i; j++)
+         {
+            F(i, j) = M(i, j) * (du_HO_loc(i) - du_HO_loc(j)) +
+                      (beta(j) * z(i) - beta(i) * z(j));
+         }
+      }
+
+      Vector gp(s), gm(s);
+      gp = 0.0;
+      gm = 0.0;
+      for (int i = 1; i < s; i++)
+      {
+         for (int j = 0; j < i; j++)
+         {
+            double fij = F(i, j);
+            if (fij >= 0.0)
+            {
+               gp(i) += fij;
+               gm(j) -= fij;
+            }
+            else
+            {
+               gm(i) += fij;
+               gp(j) -= fij;
+            }
+         }
+      }
+
+      du_loc = du_LO_loc;
+
+      for (int i = 0; i < s; i++)
+      {
+         double rp = std::max(ML(i) * (du_max_loc(i) - du_loc(i)), 0.0);
+         double rm = std::min(ML(i) * (du_min_loc(i) - du_loc(i)), 0.0);
+         double sp = gp(i), sm = gm(i);
+
+         gp(i) = (rp < sp) ? rp / sp : 1.0;
+         gm(i) = (rm > sm) ? rm / sm : 1.0;
+      }
+
+      for (int i = 1; i < s; i++)
+      {
+         for (int j = 0; j < i; j++)
+         {
+            double fij = F(i, j), aij;
+
+            if (fij >= 0.0)
+            {
+               aij = std::min(gp(i), gm(j));
+            }
+            else
+            {
+               aij = std::min(gm(i), gp(j));
+            }
+
+            fij *= aij;
+            du_loc(i) += fij / ML(i);
+            du_loc(j) -= fij / ML(j);
+         }
+      }
+   } // element loop
+}
+
+void ElementFCTProjection::CalcFCTProduct(const ParGridFunction &us, const Vector &m,
+                                     const Vector &d_us_HO, const Vector &d_us_LO,
+                                     Vector &s_min, Vector &s_max,
+                                     const Vector &u_new,
+                                     const Array<bool> &active_el,
+                                     const Array<bool> &active_dofs, Vector &d_us)
+{
+   us.HostRead();
+   s_min.HostReadWrite();
+   s_max.HostReadWrite();
+   u_new.HostRead();
+   active_el.HostRead();
+   active_dofs.HostRead();
+
+   // Compute a compatible low-order solution.
+   Vector dus_lo_fct(us.Size()), us_min(us.Size()), us_max(us.Size());
+   CalcCompatibleLOProduct(us, m, d_us_HO, s_min, s_max, u_new,
+                           active_el, active_dofs, dus_lo_fct);
+   ScaleProductBounds(s_min, s_max, u_new, active_el, active_dofs,
+                      us_min, us_max);
+
+   // ClipScale solve for d_us.
+   CalcFCTSolution(us, m, d_us_HO, dus_lo_fct, us_min, us_max, d_us);
+   ZeroOutEmptyDofs(active_el, active_dofs, d_us);
+}
+
+void NonlinearPenaltySolver::CalcFCTSolution(const ParGridFunction &u,
+                                             const Vector &m,
+                                             const Vector &du_ho,
+                                             const Vector &du_lo,
+                                             const Vector &u_min,
+                                             const Vector &u_max,
+                                             Vector &du) const
+{
+   const int size = u.Size();
+   Vector du_ho_star(size);
+
+   double umin, umax;
+
+   u.HostRead();  m.HostRead();
+   du_ho.HostRead(); du_lo.HostRead();
+   u_min.HostRead(); u_max.HostRead();
+   du.HostReadWrite();
+
+   // Smoothness indicator.
+   ParGridFunction si_val;
+   if (smth_indicator)
+   {
+      smth_indicator->ComputeSmoothnessIndicator(u, si_val);
+   }
+
+   // Clipped flux.
+   for (int i = 0; i < size; i++)
+   {
+      umin = u_min(i);
+      umax = u_max(i);
+      if (smth_indicator)
+      {
+         smth_indicator->UpdateBounds(i, u(i) + dt * du_ho(i),
+                                      si_val, umin, umax);
+      }
+
+      // Note that this uses u(i) at the old time.
+      du_ho_star(i) = min( (umax - u(i)) / dt,
+                           max(du_ho(i), (umin - u(i)) / dt) );
+   }
+
+   // Non-conservative fluxes.
+   Vector fL(size), fH(size);
+   for (int i = 0; i < size; i++)
+   {
+      fL(i) = m(i) * (du_ho_star(i) - du_lo(i));
+      fH(i) = m(i) * (du_ho_star(i) - du_ho(i));
+   }
+
+   // Restore conservation.
+   Vector flux_correction(size);
+   CorrectFlux(fL, fH, flux_correction);
+
+   for (int i = 0; i < size; i++)
+   {
+      fL(i) += flux_correction(i);
+   }
+
+   for (int i = 0; i < size; i++)
+   {
+      du(i) = du_lo(i) + fL(i) / m(i);
+   }
+}
+
+void get_z(double lambda, const Vector &w, const Vector &flux, Vector &zz)
+{
+   for (int j=0; j<w.Size(); j++)
+   {
+      zz(j) = (abs(flux(j)) >= lambda*abs(w(j))) ? lambda * w(j) : flux(j);
+   }
+}
+
+double get_lambda_times_sum_z(double lambda,
+                              const Vector &w, const Vector &fluxL)
+{
+   double lambda_times_z;
+   double lambda_times_sum_z = 0.0;
+   for (int j=0; j<w.Size(); j++)
+   {
+      // In the second case, lambda * w(j) is set to fluxL(j).
+      lambda_times_z = (abs(fluxL(j)) >= lambda*abs(w(j)))
+                       ? lambda * w(j) : fluxL(j);
+
+      lambda_times_sum_z += lambda_times_z;
+   }
+   return lambda_times_sum_z;
+}
+
+void get_lambda(double delta, const Vector &w,
+                const Vector &fluxL, Vector &zz)
+{
+   // solve nonlinearity F(lambda)=0
+   double tol=1e-15;
+   double lambdaLower=0., lambdaUpper = 0.;
+   double FLower=0., FUpper=0.;
+
+   // compute starting F and check F at the min (lambda = 0).
+   double lambda = 1.0;
+   double F = delta - get_lambda_times_sum_z(lambda, w, fluxL);
+   double F0 = delta-get_lambda_times_sum_z(0.0, w, fluxL);
+   if (abs(F) <= tol)
+   {
+      get_z(lambda, w, fluxL, zz);
+   }
+   else if (abs(F0)<=tol)
+   {
+      get_z(0.0, w, fluxL, zz);
+   }
+
+   // solve non-linearity
+   // Get lambda values that give Fs on both sides of the zero.
+   double factor = 1.0;
+   do
+   {
+      factor *= 2.0;
+      lambdaLower = lambda/factor;
+      lambdaUpper = factor*lambda;
+      FLower = delta - get_lambda_times_sum_z(lambdaLower,w,fluxL);
+      FUpper = delta - get_lambda_times_sum_z(lambdaUpper,w,fluxL);
+   }
+   while (F*FLower > 0 && F*FUpper > 0);
+
+   // check if either of lambdaLower or lambdaUpper hit the solution
+   if (FLower==0.0)
+   {
+      get_z(lambdaLower, w, fluxL, zz);
+   }
+   else if (FUpper==0.0)
+   {
+      get_z(lambdaUpper, w, fluxL, zz);
+   }
+
+   // get STARTING lower and upper bounds for lambda
+   if (F*FLower < 0)
+   {
+      lambdaUpper = lambda;
+   }
+   else
+   {
+      lambdaLower = lambda;
+   }
+
+   // get STARTING lower and upper bounds on F
+   FLower = delta - get_lambda_times_sum_z(lambdaLower,w,fluxL);
+   FUpper = delta - get_lambda_times_sum_z(lambdaUpper,w,fluxL);
+
+   do
+   {
+      // compute new lambda and new F
+      lambda = 0.5*(lambdaLower+lambdaUpper);
+      F = delta - get_lambda_times_sum_z(lambda,w,fluxL);
+      if (F*FLower < 0)
+      {
+         lambdaUpper = lambda;
+         FUpper = F;
+      }
+      else
+      {
+         lambdaLower = lambda;
+         FLower = F;
+      }
+   }
+   while (abs(F)>tol);
+
+   lambda = 0.5*(lambdaLower+lambdaUpper);
+   get_z(lambda, w, fluxL, zz);
+}
+
+void NonlinearPenaltySolver::CorrectFlux(Vector &fluxL, Vector &fluxH,
+                                         Vector &flux_fix) const
+{
+   // This consider any definition of wi. If a violation on MPP is created,
+   // then wi is s.t. fi=0.
+   // The idea is to relax the penalization wi in favor of MPP
+   const int num_cells = pfes.GetNE();
+   const int xd = pfes.GetFE(0)->GetDof();
+
+   Array<int> ldofs;
+   Vector fluxL_z(xd), fluxH_z(xd), flux_correction_z(xd);
+   for (int i = 0; i < num_cells; i++)
+   {
+      pfes.GetElementDofs(i, ldofs);
+      fluxL.GetSubVector(ldofs, fluxL_z);
+      fluxH.GetSubVector(ldofs, fluxH_z);
+
+      double fp = 0.0, fn = 0.0;
+      for (int j = 0; j < xd; j++)
+      {
+         if (fluxL_z(j) >= 0.0) { fp += fluxL_z(j); }
+         else                   { fn += fluxL_z(j); }
+      }
+
+      double delta = fp + fn;
+
+      if (delta == 0.0)
+      {
+         flux_correction_z = 0.0;
+         flux_fix.SetSubVector(ldofs, flux_correction_z);
+         continue;
+      }
+
+      // compute penalization terms wi's as desired
+      Vector w(xd);
+      const double eps = pfes.GetMesh()->GetElementSize(0,0) / pfes.GetOrder(0);
+      for (int j = 0; j < xd; j++)
+      {
+         if (delta > 0.0)
+         {
+            w(j) = (fluxL_z(j) > 0.0)
+                   ? eps * abs(fluxL_z(j)) + abs(get_max_on_cellNi(fluxH_z))
+                   : 0.0;
+         }
+         else
+         {
+            w(j) = (fluxL_z(j) < 0.0)
+                   ? - eps * abs(fluxL_z(j)) - abs(get_max_on_cellNi(fluxH_z))
+                   : 0.0;
+         }
+      }
+
+      // compute lambda and the flux correction.
+      get_lambda(delta, w, fluxL_z, flux_correction_z);
+      flux_correction_z.Neg();
+
+      flux_fix.SetSubVector(ldofs, flux_correction_z);
+   }
+}
+
+double NonlinearPenaltySolver::get_max_on_cellNi(Vector &fluxH) const
+{
+   double MAX = -1.0;
+   for (int i = 0; i < fluxH.Size(); i++)
+   {
+      MAX = max(fabs(fluxH(i)), MAX);
+   }
+   return MAX;
+}
+
+} // namespace mfem

From 47ec6d853a89463811579a59c94f645d877d64fb Mon Sep 17 00:00:00 2001
From: august-knox <knox10@llnl.gov>
Date: Mon, 6 Jan 2025 16:04:42 -0800
Subject: [PATCH 19/24] replacing remhos_fct.cpp

---
 remhos_fct.cpp | 185 +++++++++++++++++++++----------------------------
 1 file changed, 78 insertions(+), 107 deletions(-)

diff --git a/remhos_fct.cpp b/remhos_fct.cpp
index be2454b..53dc595 100644
--- a/remhos_fct.cpp
+++ b/remhos_fct.cpp
@@ -81,23 +81,24 @@ void FCTSolver::CalcCompatibleLOProduct(const ParGridFunction &us,
          if (s_avg > smax &&
              mass_us - eps < smax * mass_u) { s_avg = smax; }
 
-#ifdef REMHOS_FCT_PRODUCT_DEBUG
-         // Check if s_avg = mass_us / mass_u is within the bounds of the full
-         // stencil of active dofs.
-         if (mass_us + eps < smin * mass_u ||
-             mass_us - eps > smax * mass_u ||
-             s_avg + eps < smin ||
-             s_avg - eps > smax)
+         if (verify_bounds)
          {
-            std::cout << "---\ns_avg element bounds: "
-                      << smin << " " << s_avg << " " << smax << std::endl;
-            std::cout << "Element " << k << std::endl;
-            std::cout << "Masses " << mass_us << " " << mass_u << std::endl;
-            PrintCellValues(k, NE, u_new, "u_loc: ");
+            // Check if s_avg = mass_us / mass_u is within the bounds of the
+            // full stencil of active dofs.
+            if (mass_us + eps < smin * mass_u ||
+                mass_us - eps > smax * mass_u ||
+                s_avg + eps < smin ||
+                s_avg - eps > smax)
+            {
+               std::cout << "---\ns_avg element bounds: "
+                         << smin << " " << s_avg << " " << smax << std::endl;
+               std::cout << "Element " << k << std::endl;
+               std::cout << "Masses " << mass_us << " " << mass_u << std::endl;
+               PrintCellValues(k, NE, u_new, "u_loc: ");
 
-            MFEM_ABORT("s_avg is not in the full stencil bounds!");
+               MFEM_ABORT("s_avg is not in the full stencil bounds!");
+            }
          }
-#endif
 
          // When s_avg is not in the local bounds for some dof (it should be
          // within the full stencil of active dofs), reset the bounds to s_avg.
@@ -114,32 +115,6 @@ void FCTSolver::CalcCompatibleLOProduct(const ParGridFunction &us,
          dof_id = k*ndofs + j;
          d_us_LO_new(dof_id) = (u_new(dof_id) * s_avg - us(dof_id)) / dt;
       }
-
-#ifdef REMHOS_FCT_PRODUCT_DEBUG
-      // Check the LO product solution.
-      double us_min, us_max;
-      for (int j = 0; j < ndofs; j++)
-      {
-         dof_id = k*ndofs + j;
-         if (active_dofs[dof_id] == false) { continue; }
-
-         us_min = s_min_loc(j) * u_new(dof_id);
-         us_max = s_max_loc(j) * u_new(dof_id);
-
-         if (s_avg * u_new(dof_id) + eps < us_min ||
-             s_avg * u_new(dof_id) - eps > us_max)
-         {
-            std::cout << "---\ns_avg * u: " << k << " "
-                      << us_min << " "
-                      << s_avg * u_new(dof_id) << " "
-                      << us_max << endl
-                      << u_new(dof_id) << " " << s_avg << endl
-                      << s_min_loc(j) << " " << s_max_loc(j) << "\n---\n";
-
-            MFEM_ABORT("s_avg * u not in bounds");
-         }
-      }
-#endif
    }
 }
 
@@ -286,43 +261,35 @@ void FluxBasedFCT::CalcFCTProduct(const ParGridFunction &us, const Vector &m,
       dus_lo_fct = d_us;
    }
 
-#ifdef REMHOS_FCT_PRODUCT_DEBUG
-   // Check the bounds of the final solution.
-   const double eps = 1e-12;
-   Vector us_new(d_us.Size());
-   add(1.0, us, dt, d_us, us_new);
-   for (int k = 0; k < NE; k++)
+   if (verify_bounds)
    {
-      if (active_el[k] == false) { continue; }
-
-      for (int j = 0; j < ndofs; j++)
+      // Check the bounds of the final solution.
+      const double eps = 1e-12;
+      Vector us_new(d_us.Size());
+      add(1.0, us, dt, d_us, us_new);
+      for (int k = 0; k < NE; k++)
       {
-         dof_id = k*ndofs + j;
-         if (active_dofs[dof_id] == false) { continue; }
+         if (active_el[k] == false) { continue; }
 
-         double s = us_new(dof_id) / u_new(dof_id);
-         if (s + eps < s_min(dof_id) ||
-             s - eps > s_max(dof_id))
+         for (int j = 0; j < ndofs; j++)
          {
-            std::cout << "Final s " << j << " " << k << " "
-                      << s_min(dof_id) << " "
-                      << s << " "
-                      << s_max(dof_id) << std::endl;
-            std::cout << "---\n";
-         }
+            dof_id = k*ndofs + j;
+            if (active_dofs[dof_id] == false) { continue; }
 
-         if (us_new(dof_id) + eps < us_min(dof_id) ||
-             us_new(dof_id) - eps > us_max(dof_id))
-         {
-            std::cout << "Final us " << j << " " << k << " "
-                      << us_min(dof_id) << " "
-                      << us_new(dof_id) << " "
-                      << us_max(dof_id) << std::endl;
-            std::cout << "---\n";
+            if (us_new(dof_id) + eps < us_min(dof_id) ||
+                us_new(dof_id) - eps > us_max(dof_id))
+            {
+               std::cout << "Final us " << j << " " << k << " "
+                         << us_min(dof_id) << " "
+                         << us_new(dof_id) << " "
+                         << us_max(dof_id) << std::endl;
+               std::cout << "---\n";
+
+               MFEM_ABORT("us not in bounds after FCT.");
+            }
          }
       }
    }
-#endif
 }
 
 void FluxBasedFCT::ComputeFluxMatrix(const ParGridFunction &u,
@@ -583,46 +550,49 @@ void ClipScaleSolver::CalcFCTProduct(const ParGridFunction &us, const Vector &m,
    CalcFCTSolution(us, m, d_us_HO, dus_lo_fct, us_min, us_max, d_us);
    ZeroOutEmptyDofs(active_el, active_dofs, d_us);
 
-#ifdef REMHOS_FCT_PRODUCT_DEBUG
-   // Check the bounds of the final solution.
-   const int NE = pfes.GetNE();
-   const int ndofs = u_new.Size() / NE;
-   int dof_id;
-   const double eps = 1e-12;
-   Vector us_new(d_us.Size());
-   add(1.0, us, dt, d_us, us_new);
-   for (int k = 0; k < NE; k++)
+   if (verify_bounds)
    {
-      if (active_el[k] == false) { continue; }
-
-      for (int j = 0; j < ndofs; j++)
+      // Check the bounds of the final solution.
+      const int NE = pfes.GetNE();
+      const int ndofs = u_new.Size() / NE;
+      int dof_id;
+      const double eps = 1e-12;
+      Vector us_new(d_us.Size());
+      add(1.0, us, dt, d_us, us_new);
+      for (int k = 0; k < NE; k++)
       {
-         dof_id = k*ndofs + j;
-         if (active_dofs[dof_id] == false) { continue; }
+         if (active_el[k] == false) { continue; }
 
-         double s = us_new(dof_id) / u_new(dof_id);
-         if (s + eps < s_min(dof_id) ||
-             s - eps > s_max(dof_id))
+         for (int j = 0; j < ndofs; j++)
          {
-            std::cout << "Final s " << j << " " << k << " "
-                      << s_min(dof_id) << " "
-                      << s << " "
-                      << s_max(dof_id) << std::endl;
-            std::cout << "---\n";
-         }
+            dof_id = k*ndofs + j;
+            if (active_dofs[dof_id] == false) { continue; }
 
-         if (us_new(dof_id) + eps < us_min(dof_id) ||
-             us_new(dof_id) - eps > us_max(dof_id))
-         {
-            std::cout << "Final us " << j << " " << k << " "
-                      << us_min(dof_id) << " "
-                      << us_new(dof_id) << " "
-                      << us_max(dof_id) << std::endl;
-            std::cout << "---\n";
+            /* // this doesn't check round-offs in the division.
+            double s = us_new(dof_id) / u_new(dof_id);
+            if (s + eps < s_min(dof_id) ||
+                s - eps > s_max(dof_id))
+            {
+               std::cout << "Final s " << j << " " << k << " "
+                         << s_min(dof_id) << " "
+                         << s << " "
+                         << s_max(dof_id) << std::endl;
+               std::cout << "---\n";
+            }*/
+
+            if (us_new(dof_id) + eps < us_min(dof_id) ||
+                us_new(dof_id) - eps > us_max(dof_id))
+            {
+               std::cout << "Final us " << j << " " << k << " "
+                         << us_min(dof_id) << " "
+                         << us_new(dof_id) << " "
+                         << us_max(dof_id) << std::endl;
+               std::cout << "---\n";
+               MFEM_ABORT("Bounds violation FCT us.");
+            }
          }
       }
    }
-#endif
 }
 
 void ElementFCTProjection::CalcFCTSolution(const ParGridFunction &u,
@@ -745,12 +715,13 @@ void ElementFCTProjection::CalcFCTSolution(const ParGridFunction &u,
    } // element loop
 }
 
-void ElementFCTProjection::CalcFCTProduct(const ParGridFunction &us, const Vector &m,
-                                     const Vector &d_us_HO, const Vector &d_us_LO,
-                                     Vector &s_min, Vector &s_max,
-                                     const Vector &u_new,
-                                     const Array<bool> &active_el,
-                                     const Array<bool> &active_dofs, Vector &d_us)
+void ElementFCTProjection::CalcFCTProduct(const ParGridFunction &us,
+                                          const Vector &m,
+                                          const Vector &d_us_HO, const Vector &d_us_LO,
+                                          Vector &s_min, Vector &s_max,
+                                          const Vector &u_new,
+                                          const Array<bool> &active_el,
+                                          const Array<bool> &active_dofs, Vector &d_us)
 {
    us.HostRead();
    s_min.HostReadWrite();

From 28c7b8c7d628121c512bff306231bd11b912d1ff Mon Sep 17 00:00:00 2001
From: august-knox <knox10@llnl.gov>
Date: Mon, 6 Jan 2025 16:15:18 -0800
Subject: [PATCH 20/24] upstream remhos_fct.cpp

---
 remhos_fct.cpp | 255 +++++++++++++++++++++++++++++--------------------
 1 file changed, 150 insertions(+), 105 deletions(-)

diff --git a/remhos_fct.cpp b/remhos_fct.cpp
index 53dc595..9621e44 100644
--- a/remhos_fct.cpp
+++ b/remhos_fct.cpp
@@ -56,8 +56,8 @@ void FCTSolver::CalcCompatibleLOProduct(const ParGridFunction &us,
       double s_avg = mass_us / mass_u;
 
       // Min and max of s using the full stencil of active dofs.
-      s_min_loc.SetDataAndSize(s_min.GetData() + k*ndofs, ndofs);
-      s_max_loc.SetDataAndSize(s_max.GetData() + k*ndofs, ndofs);
+      s_min_loc.SetDataAndSize(s_min.HostReadWrite() + k*ndofs, ndofs);
+      s_max_loc.SetDataAndSize(s_max.HostReadWrite() + k*ndofs, ndofs);
       double smin = numeric_limits<double>::infinity(),
              smax = -numeric_limits<double>::infinity();
       for (int j = 0; j < ndofs; j++)
@@ -81,24 +81,23 @@ void FCTSolver::CalcCompatibleLOProduct(const ParGridFunction &us,
          if (s_avg > smax &&
              mass_us - eps < smax * mass_u) { s_avg = smax; }
 
-         if (verify_bounds)
+#ifdef REMHOS_FCT_PRODUCT_DEBUG
+         // Check if s_avg = mass_us / mass_u is within the bounds of the full
+         // stencil of active dofs.
+         if (mass_us + eps < smin * mass_u ||
+             mass_us - eps > smax * mass_u ||
+             s_avg + eps < smin ||
+             s_avg - eps > smax)
          {
-            // Check if s_avg = mass_us / mass_u is within the bounds of the
-            // full stencil of active dofs.
-            if (mass_us + eps < smin * mass_u ||
-                mass_us - eps > smax * mass_u ||
-                s_avg + eps < smin ||
-                s_avg - eps > smax)
-            {
-               std::cout << "---\ns_avg element bounds: "
-                         << smin << " " << s_avg << " " << smax << std::endl;
-               std::cout << "Element " << k << std::endl;
-               std::cout << "Masses " << mass_us << " " << mass_u << std::endl;
-               PrintCellValues(k, NE, u_new, "u_loc: ");
+            std::cout << "---\ns_avg element bounds: "
+                      << smin << " " << s_avg << " " << smax << std::endl;
+            std::cout << "Element " << k << std::endl;
+            std::cout << "Masses " << mass_us << " " << mass_u << std::endl;
+            PrintCellValues(k, NE, u_new, "u_loc: ");
 
-               MFEM_ABORT("s_avg is not in the full stencil bounds!");
-            }
+            MFEM_ABORT("s_avg is not in the full stencil bounds!");
          }
+#endif
 
          // When s_avg is not in the local bounds for some dof (it should be
          // within the full stencil of active dofs), reset the bounds to s_avg.
@@ -115,6 +114,32 @@ void FCTSolver::CalcCompatibleLOProduct(const ParGridFunction &us,
          dof_id = k*ndofs + j;
          d_us_LO_new(dof_id) = (u_new(dof_id) * s_avg - us(dof_id)) / dt;
       }
+
+#ifdef REMHOS_FCT_PRODUCT_DEBUG
+      // Check the LO product solution.
+      double us_min, us_max;
+      for (int j = 0; j < ndofs; j++)
+      {
+         dof_id = k*ndofs + j;
+         if (active_dofs[dof_id] == false) { continue; }
+
+         us_min = s_min_loc(j) * u_new(dof_id);
+         us_max = s_max_loc(j) * u_new(dof_id);
+
+         if (s_avg * u_new(dof_id) + eps < us_min ||
+             s_avg * u_new(dof_id) - eps > us_max)
+         {
+            std::cout << "---\ns_avg * u: " << k << " "
+                      << us_min << " "
+                      << s_avg * u_new(dof_id) << " "
+                      << us_max << endl
+                      << u_new(dof_id) << " " << s_avg << endl
+                      << s_min_loc(j) << " " << s_max_loc(j) << "\n---\n";
+
+            MFEM_ABORT("s_avg * u not in bounds");
+         }
+      }
+#endif
    }
 }
 
@@ -261,35 +286,43 @@ void FluxBasedFCT::CalcFCTProduct(const ParGridFunction &us, const Vector &m,
       dus_lo_fct = d_us;
    }
 
-   if (verify_bounds)
+#ifdef REMHOS_FCT_PRODUCT_DEBUG
+   // Check the bounds of the final solution.
+   const double eps = 1e-12;
+   Vector us_new(d_us.Size());
+   add(1.0, us, dt, d_us, us_new);
+   for (int k = 0; k < NE; k++)
    {
-      // Check the bounds of the final solution.
-      const double eps = 1e-12;
-      Vector us_new(d_us.Size());
-      add(1.0, us, dt, d_us, us_new);
-      for (int k = 0; k < NE; k++)
+      if (active_el[k] == false) { continue; }
+
+      for (int j = 0; j < ndofs; j++)
       {
-         if (active_el[k] == false) { continue; }
+         dof_id = k*ndofs + j;
+         if (active_dofs[dof_id] == false) { continue; }
 
-         for (int j = 0; j < ndofs; j++)
+         double s = us_new(dof_id) / u_new(dof_id);
+         if (s + eps < s_min(dof_id) ||
+             s - eps > s_max(dof_id))
          {
-            dof_id = k*ndofs + j;
-            if (active_dofs[dof_id] == false) { continue; }
-
-            if (us_new(dof_id) + eps < us_min(dof_id) ||
-                us_new(dof_id) - eps > us_max(dof_id))
-            {
-               std::cout << "Final us " << j << " " << k << " "
-                         << us_min(dof_id) << " "
-                         << us_new(dof_id) << " "
-                         << us_max(dof_id) << std::endl;
-               std::cout << "---\n";
+            std::cout << "Final s " << j << " " << k << " "
+                      << s_min(dof_id) << " "
+                      << s << " "
+                      << s_max(dof_id) << std::endl;
+            std::cout << "---\n";
+         }
 
-               MFEM_ABORT("us not in bounds after FCT.");
-            }
+         if (us_new(dof_id) + eps < us_min(dof_id) ||
+             us_new(dof_id) - eps > us_max(dof_id))
+         {
+            std::cout << "Final us " << j << " " << k << " "
+                      << us_min(dof_id) << " "
+                      << us_new(dof_id) << " "
+                      << us_max(dof_id) << std::endl;
+            std::cout << "---\n";
          }
       }
    }
+#endif
 }
 
 void FluxBasedFCT::ComputeFluxMatrix(const ParGridFunction &u,
@@ -451,78 +484,93 @@ void ClipScaleSolver::CalcFCTSolution(const ParGridFunction &u, const Vector &m,
                                       const Vector &u_min, const Vector &u_max,
                                       Vector &du) const
 {
+   timer->sw_FCT.Start();
+
    const int NE = pfes.GetMesh()->GetNE();
    const int nd = pfes.GetFE(0)->GetDof();
-   Vector f_clip(nd);
-
-   int dof_id;
-   double sumPos, sumNeg, u_new_ho, u_new_lo, new_mass, f_clip_min, f_clip_max;
-   double umin, umax;
-   const double eps = 1.0e-15;
+   Vector f_clip(nd*NE);
 
    // Smoothness indicator.
    ParGridFunction si_val;
+
    if (smth_indicator)
    {
+      MFEM_ABORT("smth_indicator not supported in kernel!");
       smth_indicator->ComputeSmoothnessIndicator(u, si_val);
    }
 
-   u.HostRead();
-   m.HostRead();
-   du.HostReadWrite();
-   du_lo.HostRead(); du_ho.HostRead();
-   for (int k = 0; k < NE; k++)
+   const auto N4t = dt;
+
+   const auto U = mfem::Reshape(u.Read(), NE*nd);
+   const auto M = mfem::Reshape(m.Read(), NE*nd);
+   const auto DU_LO = mfem::Reshape(du_lo.Read(), NE*nd);
+   const auto DU_HO = mfem::Reshape(du_ho.Read(), NE*nd);
+   const auto U_MIN = mfem::Reshape(u_min.Read(), NE*nd);
+   const auto U_MAX = mfem::Reshape(u_max.Read(), NE*nd);
+
+   auto F_CLIP = mfem::Reshape(f_clip.Write(), NE, nd);
+   auto DU = mfem::Reshape(du.Write(), NE*nd);
+
+   const bool update_bounds = smth_indicator != nullptr;
+   if (update_bounds) MFEM_ABORT("UpdateBounds not ported to device")
+
+      mfem::forall(NE, [=] MFEM_HOST_DEVICE (int k)
    {
-      sumPos = sumNeg = 0.0;
+      constexpr auto eps = 1.0e-15;
+      double sumPos = 0.0, sumNeg = 0.0;
 
       // Clip.
       for (int j = 0; j < nd; j++)
       {
-         dof_id = k*nd+j;
+         const int dof_id = k*nd+j;
+
+         const auto u_new_lo = U(dof_id) + N4t * DU_LO(dof_id);
 
-         u_new_ho   = u(dof_id) + dt * du_ho(dof_id);
-         u_new_lo   = u(dof_id) + dt * du_lo(dof_id);
+         auto umin = U_MIN(dof_id);
+         auto umax = U_MAX(dof_id);
 
-         umin = u_min(dof_id);
-         umax = u_max(dof_id);
-         if (smth_indicator)
+#if !defined(MFEM_USE_CUDA) && !defined(MFEM_USE_HIP)
+         if (update_bounds)
          {
+            const auto u_new_ho   = U(dof_id) + N4t * DU_HO(dof_id);
             smth_indicator->UpdateBounds(dof_id, u_new_ho, si_val, umin, umax);
          }
+#endif
 
-         f_clip_min = m(dof_id) / dt * (umin - u_new_lo);
-         f_clip_max = m(dof_id) / dt * (umax - u_new_lo);
+         const auto f_clip_min = M(dof_id) / N4t * (umin - u_new_lo);
+         const auto f_clip_max = M(dof_id) / N4t * (umax - u_new_lo);
 
-         f_clip(j) = m(dof_id) * (du_ho(dof_id) - du_lo(dof_id));
-         f_clip(j) = min(f_clip_max, max(f_clip_min, f_clip(j)));
+         F_CLIP(k,j) = M(dof_id) * (DU_HO(dof_id) - DU_LO(dof_id));
+         F_CLIP(k,j) = fmin(f_clip_max, fmax(f_clip_min, F_CLIP(k,j)));
 
-         sumNeg += min(f_clip(j), 0.0);
-         sumPos += max(f_clip(j), 0.0);
+         sumNeg += fmin(F_CLIP(k,j), 0.0);
+         sumPos += fmax(F_CLIP(k,j), 0.0);
       }
 
-      new_mass = sumNeg + sumPos;
+      double new_mass = sumNeg + sumPos;
 
       // Rescale.
       for (int j = 0; j < nd; j++)
       {
          if (new_mass > eps)
          {
-            f_clip(j) = min(0.0, f_clip(j)) -
-                        max(0.0, f_clip(j)) * sumNeg / sumPos;
+            F_CLIP(k,j) = fmin(0.0, F_CLIP(k,j)) -
+                          fmax(0.0, F_CLIP(k,j)) * sumNeg / sumPos;
          }
          if (new_mass < -eps)
          {
-            f_clip(j) = max(0.0, f_clip(j)) -
-                        min(0.0, f_clip(j)) * sumPos / sumNeg;
+            F_CLIP(k,j) = fmax(0.0, F_CLIP(k,j)) -
+                          fmin(0.0, F_CLIP(k,j)) * sumPos / sumNeg;
          }
 
          // Set du to the discrete time derivative featuring the high order
          // anti-diffusive reconstruction that leads to an forward Euler
          // updated admissible solution.
-         dof_id = k*nd+j;
-         du(dof_id) = du_lo(dof_id) + f_clip(j) / m(dof_id);
+         const int dof_id = k*nd+j;
+         DU(dof_id) = DU_LO(dof_id) + F_CLIP(k,j) / M(dof_id);
       }
-   }
+   });
+   timer->sw_FCT.Stop();
 }
 
 void ClipScaleSolver::CalcFCTProduct(const ParGridFunction &us, const Vector &m,
@@ -550,49 +598,46 @@ void ClipScaleSolver::CalcFCTProduct(const ParGridFunction &us, const Vector &m,
    CalcFCTSolution(us, m, d_us_HO, dus_lo_fct, us_min, us_max, d_us);
    ZeroOutEmptyDofs(active_el, active_dofs, d_us);
 
-   if (verify_bounds)
+#ifdef REMHOS_FCT_PRODUCT_DEBUG
+   // Check the bounds of the final solution.
+   const int NE = pfes.GetNE();
+   const int ndofs = u_new.Size() / NE;
+   int dof_id;
+   const double eps = 1e-12;
+   Vector us_new(d_us.Size());
+   add(1.0, us, dt, d_us, us_new);
+   for (int k = 0; k < NE; k++)
    {
-      // Check the bounds of the final solution.
-      const int NE = pfes.GetNE();
-      const int ndofs = u_new.Size() / NE;
-      int dof_id;
-      const double eps = 1e-12;
-      Vector us_new(d_us.Size());
-      add(1.0, us, dt, d_us, us_new);
-      for (int k = 0; k < NE; k++)
+      if (active_el[k] == false) { continue; }
+
+      for (int j = 0; j < ndofs; j++)
       {
-         if (active_el[k] == false) { continue; }
+         dof_id = k*ndofs + j;
+         if (active_dofs[dof_id] == false) { continue; }
 
-         for (int j = 0; j < ndofs; j++)
+         double s = us_new(dof_id) / u_new(dof_id);
+         if (s + eps < s_min(dof_id) ||
+             s - eps > s_max(dof_id))
          {
-            dof_id = k*ndofs + j;
-            if (active_dofs[dof_id] == false) { continue; }
+            std::cout << "Final s " << j << " " << k << " "
+                      << s_min(dof_id) << " "
+                      << s << " "
+                      << s_max(dof_id) << std::endl;
+            std::cout << "---\n";
+         }
 
-            /* // this doesn't check round-offs in the division.
-            double s = us_new(dof_id) / u_new(dof_id);
-            if (s + eps < s_min(dof_id) ||
-                s - eps > s_max(dof_id))
-            {
-               std::cout << "Final s " << j << " " << k << " "
-                         << s_min(dof_id) << " "
-                         << s << " "
-                         << s_max(dof_id) << std::endl;
-               std::cout << "---\n";
-            }*/
-
-            if (us_new(dof_id) + eps < us_min(dof_id) ||
-                us_new(dof_id) - eps > us_max(dof_id))
-            {
-               std::cout << "Final us " << j << " " << k << " "
-                         << us_min(dof_id) << " "
-                         << us_new(dof_id) << " "
-                         << us_max(dof_id) << std::endl;
-               std::cout << "---\n";
-               MFEM_ABORT("Bounds violation FCT us.");
-            }
+         if (us_new(dof_id) + eps < us_min(dof_id) ||
+             us_new(dof_id) - eps > us_max(dof_id))
+         {
+            std::cout << "Final us " << j << " " << k << " "
+                      << us_min(dof_id) << " "
+                      << us_new(dof_id) << " "
+                      << us_max(dof_id) << std::endl;
+            std::cout << "---\n";
          }
       }
    }
+#endif
 }
 
 void ElementFCTProjection::CalcFCTSolution(const ParGridFunction &u,

From 4a89f5abd3c44a0b2cb91c88a8d7acbf2aa7cb38 Mon Sep 17 00:00:00 2001
From: august-knox <knox10@llnl.gov>
Date: Tue, 7 Jan 2025 09:08:15 -0800
Subject: [PATCH 21/24] making caliper option clearer

---
 makefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/makefile b/makefile
index af825f5..72a41df 100644
--- a/makefile
+++ b/makefile
@@ -64,11 +64,11 @@ TEST_MK = $(MFEM_DIR)/config/test.mk
 MFEM_DIR1 := $(MFEM_DIR)
 MFEM_DIR2 := $(realpath $(MFEM_DIR))
 
-# Caliper/Adiak flags
+# Use Caliper annotations
 
-#CALIPER_DIR = $(spack location --install-dir caliper)
-#ADIAK_DIR = $(spack location --install-dir adiak)
-ifdef CALIPER_DIR
+ifdef -DUSE_CALIPER
+CALIPER_DIR = $(spack location --install-dir caliper)
+ADIAK_DIR = $(spack location --install-dir adiak)
 CALIPER_FLAGS = -I${CALIPER_DIR}/include -DUSE_CALIPER
 ADIAK_INCLUDE = -I${ADIAK_DIR}/include 
 ADIAK_LDFLAGS =  -L${ADIAK_DIR}/lib -ladiak

From 61aeca020b96375adfa6354d5b39308f710d7e40 Mon Sep 17 00:00:00 2001
From: august-knox <knox10@llnl.gov>
Date: Tue, 7 Jan 2025 09:16:48 -0800
Subject: [PATCH 22/24] changing use_caliper syntax

---
 makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/makefile b/makefile
index 72a41df..a1865f2 100644
--- a/makefile
+++ b/makefile
@@ -66,7 +66,7 @@ MFEM_DIR2 := $(realpath $(MFEM_DIR))
 
 # Use Caliper annotations
 
-ifdef -DUSE_CALIPER
+ifdef USE_CALIPER
 CALIPER_DIR = $(spack location --install-dir caliper)
 ADIAK_DIR = $(spack location --install-dir adiak)
 CALIPER_FLAGS = -I${CALIPER_DIR}/include -DUSE_CALIPER

From 12b052be24a5aa9c7765b0695d385602bd2df617 Mon Sep 17 00:00:00 2001
From: august-knox <knox10@llnl.gov>
Date: Tue, 7 Jan 2025 09:29:12 -0800
Subject: [PATCH 23/24] returning to original method

---
 makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/makefile b/makefile
index a1865f2..1434602 100644
--- a/makefile
+++ b/makefile
@@ -66,7 +66,7 @@ MFEM_DIR2 := $(realpath $(MFEM_DIR))
 
 # Use Caliper annotations
 
-ifdef USE_CALIPER
+ifdef CALIPER_DIR
 CALIPER_DIR = $(spack location --install-dir caliper)
 ADIAK_DIR = $(spack location --install-dir adiak)
 CALIPER_FLAGS = -I${CALIPER_DIR}/include -DUSE_CALIPER

From 777013652ac1711d360a8431edc8232beb21e63c Mon Sep 17 00:00:00 2001
From: august-knox <112430443+august-knox@users.noreply.github.com>
Date: Tue, 7 Jan 2025 10:02:31 -0800
Subject: [PATCH 24/24] Update remhos_fct.cpp

adding delta symbols back
---
 remhos_fct.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/remhos_fct.cpp b/remhos_fct.cpp
index 9621e44..d3cb1cc 100644
--- a/remhos_fct.cpp
+++ b/remhos_fct.cpp
@@ -499,7 +499,7 @@ void ClipScaleSolver::CalcFCTSolution(const ParGridFunction &u, const Vector &m,
       smth_indicator->ComputeSmoothnessIndicator(u, si_val);
    }
 
-   const auto N4t = dt;
+   const auto δt = dt;
 
    const auto U = mfem::Reshape(u.Read(), NE*nd);
    const auto M = mfem::Reshape(m.Read(), NE*nd);
@@ -524,7 +524,7 @@ void ClipScaleSolver::CalcFCTSolution(const ParGridFunction &u, const Vector &m,
       {
          const int dof_id = k*nd+j;
 
-         const auto u_new_lo = U(dof_id) + N4t * DU_LO(dof_id);
+         const auto u_new_lo = U(dof_id) + δt * DU_LO(dof_id);
 
          auto umin = U_MIN(dof_id);
          auto umax = U_MAX(dof_id);
@@ -532,13 +532,13 @@ void ClipScaleSolver::CalcFCTSolution(const ParGridFunction &u, const Vector &m,
 #if !defined(MFEM_USE_CUDA) && !defined(MFEM_USE_HIP)
          if (update_bounds)
          {
-            const auto u_new_ho   = U(dof_id) + N4t * DU_HO(dof_id);
+            const auto u_new_ho   = U(dof_id) + δt * DU_HO(dof_id);
             smth_indicator->UpdateBounds(dof_id, u_new_ho, si_val, umin, umax);
          }
 #endif
 
-         const auto f_clip_min = M(dof_id) / N4t * (umin - u_new_lo);
-         const auto f_clip_max = M(dof_id) / N4t * (umax - u_new_lo);
+         const auto f_clip_min = M(dof_id) / δt * (umin - u_new_lo);
+         const auto f_clip_max = M(dof_id) / δt * (umax - u_new_lo);
 
          F_CLIP(k,j) = M(dof_id) * (DU_HO(dof_id) - DU_LO(dof_id));
          F_CLIP(k,j) = fmin(f_clip_max, fmax(f_clip_min, F_CLIP(k,j)));