diff --git a/amgx_precon_postprocess-base.py b/amgx_precon_postprocess-base.py new file mode 100644 index 0000000..18ea9a0 --- /dev/null +++ b/amgx_precon_postprocess-base.py @@ -0,0 +1,145 @@ +# Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at +# the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights +# reserved. See files LICENSE and NOTICE for details. +# +# This file is part of CEED, a collection of benchmarks, miniapps, software +# libraries and APIs for efficient high-order finite element and spectral +# element discretizations for exascale applications. For more information and +# source code availability see http://github.com/ceed. +# +# The CEED research is supportted by the Exascale Computing Project +# (17-SC-20-SC), a collaborative effort of two U.S. Department of Energy +# organizations (Office of Science and the National Nuclear Security +# Administration) responsible for the planning and preparation of a capable +# exascale ecosystem, including software, applications, hardware, advanced +# system engineering and early testbed platforms, in support of the nation's +# exascale computing imperative. + +from sys import stdout as out +import fileinput +import pprint + +##### Read all input files specified on the command line, or stdin and parse +##### the content, storing it as a list of dictionaries - one dictionary for +##### each test run. + +it=fileinput.input() +state=0 +line='' +i=0 +mesh_p=1 +config='unknown' +compiler='unknown' +test='unknown' +num_procs=0 +num_procs_node=0 +lnfmt='%05i' +data={} +runs=[] + +while True: + ## + if state%2==0: + ## + try: + line=it.next() + i=i+1 + except StopIteration: + break + state=state+1 + ## + elif state==1: + ## + state=0 + if 'Reading configuration' in line: + ## + ## This is the beginning of a new file. + ## + # out.write('\n'+lnfmt%i+': %s'%line) + config=line.split()[2] + num_procs=0 + num_procs_node=0 + elif 'Setting up compiler' in line: + # out.write(lnfmt%i+': %s'%line) + compiler=line.split()[3] + elif 'Reading test file: ' in line: + # out.write(lnfmt%i+': %s'%line) + test=line.strip().split('Reading test file: ',1)[-1] + elif 'Running the tests using a total of' in line: + # out.write(lnfmt%i+': %s'%line) + num_procs=int(line.split('a total of ',1)[1].split(None,1)[0]) + elif 'tasks per node' in line: + # out.write(lnfmt%i+': %s'%line) + num_procs_node=int(line.split(' tasks per',1)[0].rsplit(None,1)[1]) + elif line == 'Options used:\n': + ## + ## This is the beginning of a new run. 
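+            ## The previous run (if any) is committed to 'runs' only if it
+            ## produced a '"DOFs/sec" in CG' line, i.e. only if
+            ## 'cg-iteration-dps' was parsed; incomplete runs are dropped.
+            ## Each stored run is a plain dict, e.g. (illustrative values):
+            ##   {'config': ..., 'compiler': ..., 'test': ..., 'num-procs': 4,
+            ##    'order': 2, 'num-unknowns': 2097152, 'amg-setup': 0.12,
+            ##    'total-cg-time': 1.5, 'cg-iteration-dps': 3.2e8, ...}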
+ ## + if 'cg-iteration-dps' in data: + runs.append(data) + # print + # pprint.pprint(data) + data={} + data['file']=fileinput.filename() + data['config']=config + data['compiler']=compiler + data['test']=test + data['num-procs']=num_procs + data['num-procs-node']=num_procs_node + data['mesh-order']=mesh_p + data['code']="MFEM" + test_=test.rsplit('/',1)[-1] + data['case']='scalar' if (('bp1' in test_) or ('bp3' in test_) or + ('bp5' in test_)) else 'vector' + # out.write('\n'+lnfmt%i+': %s'%line) + # out.write('*'*len(lnfmt%i)+': --mesh-p %i\n'%mesh_p) + state=2 + elif 'setup' in line: + data['amg-setup'] = float(line.split()[1]) + #Used when using AmgX as a solver + elif 'solve(per iteration)' in line: + data['cost-of-precon'] = float(line.split()[2]) + elif 'Total iterations' in line: + data['iterations'] = float(line.split()[2]) + elif 'Time per CG step' in line: + data['time-per-cg-step'] = float(line.split()[4]) + elif 'Total CG time' in line: + data['total-cg-time'] = float(line.split()[3]) + elif '"DOFs/sec" in CG' in line: + # out.write(lnfmt%i+': %s'%line) + data['cg-iteration-dps']=1e6*float(line.split(' ')[3]) + elif 'Number of finite element unknowns' in line: + # out.write(lnfmt%i+': %s'%line) + data['num-unknowns']=long(line.rsplit(None,1)[1]) + elif 'Number of qudrature points per element' in line: + # out.write(lnfmt%i+': %s'%line) + data['quadrature-pts']=int(line.split(' = ',1)[1]) + elif 'Total number of elements:' in line: + # out.write(lnfmt%i+': %s'%line) + data['num-elem']=int(line.rsplit(None,1)[1]) + ## + elif state==3: + ## + if line[:5] == ' --': + # out.write(lnfmt%i+': %s'%line) + state=2 + opt=line[5:].strip().split(' ',1) + if opt[0] in ('order'): + data[opt[0]]=int(opt[1]) + elif opt[0] in ('device'): + data['mfem-device']=opt[1] + else: + state=1 + +if 'cg-iteration-dps' in data: + runs.append(data) + # print + # pprint.pprint(data) +for run in runs: + if not 'quadrature-pts' in run: + run['quadrature-pts']=0 +# print +# print +# pprint.pprint(runs) + +print 'Number of test runs read: %i'%len(runs) diff --git a/amgx_precon_postprocess-plot-2.py b/amgx_precon_postprocess-plot-2.py new file mode 100644 index 0000000..d217186 --- /dev/null +++ b/amgx_precon_postprocess-plot-2.py @@ -0,0 +1,220 @@ +# Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at +# the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights +# reserved. See files LICENSE and NOTICE for details. +# +# This file is part of CEED, a collection of benchmarks, miniapps, software +# libraries and APIs for efficient high-order finite element and spectral +# element discretizations for exascale applications. For more information and +# source code availability see http://github.com/ceed. +# +# The CEED research is supported by the Exascale Computing Project +# (17-SC-20-SC), a collaborative effort of two U.S. Department of Energy +# organizations (Office of Science and the National Nuclear Security +# Administration) responsible for the planning and preparation of a capable +# exascale ecosystem, including software, applications, hardware, advanced +# system engineering and early testbed platforms, in support of the nation's +# exascale computing imperative. + +#my_y_data = 'amg-setup' +#my_y_data = 'time-per-cg-step' +my_y_data = 'total-cg-time' + +##### Adjustable plot parameters: +log_y=0 # use log scale on the y-axis? 
+#x_range=(1e3,6e6) # plot range for the x-axis; comment out for auto +#y_range=(0,2.2e7) # plot range for the y-axis; comment out for auto +#y_range=(0,2.5e6) # plot range for the y-axis; comment out for auto +draw_iter_lines=0 # draw the "iter/s" lines? +ymin_iter_lines=3e5 # minimal y value for the "iter/s" lines +ymax_iter_lines=8e8 # maximal y value for the "iter/s" lines +legend_ncol=(2 if log_y else 1) # number of columns in the legend +write_figures=1 # save the figures to files? +show_figures=0 # display the figures on the screen? + + +##### Load the data +execfile('amgx_precon_postprocess-base.py') + + +##### Sample plot output + +from matplotlib import use +if not show_figures: + use('pdf') +from pylab import * + +rcParams['font.sans-serif'].insert(0,'Noto Sans') +rcParams['font.sans-serif'].insert(1,'Open Sans') +rcParams['figure.figsize']=[10, 8] # default: 8 x 6 + +cm=get_cmap('Set1') # 'Accent', 'Dark2', 'Set1', 'Set2', 'Set3' +if '_segmentdata' in cm.__dict__: + cm_size=len(cm.__dict__['_segmentdata']['red']) +elif 'colors' in cm.__dict__: + cm_size=len(cm.__dict__['colors']) +colors=[cm(1.*i/(cm_size-1)) for i in range(cm_size)] + +# colors=['blue','green','crimson','turquoise','m','gold','cornflowerblue', +# 'darkorange'] + +sel_runs=runs +configs=list(set([run['config'].rsplit('/',1)[-1].rsplit('.sh',1)[0] + for run in sel_runs])) +# print 'Present configurations:', configs +config=configs[0] +print 'Using configuration:', config +config_short=config +sel_runs=[run for run in sel_runs if + run['config'].rsplit('/',1)[-1].rsplit('.sh',1)[0]==config] + +tests=list(set([run['test'].rsplit('/',1)[-1].rsplit('.sh',1)[0] + for run in sel_runs])) +# print 'Present tests:', tests +test=tests[0] +print 'Using test:', test +test_short=test +sel_runs=[run for run in sel_runs if + run['test'].rsplit('/',1)[-1].rsplit('.sh',1)[0]==test] + +vdim=1 +prob=test + +codes = list(set([run['code'] for run in sel_runs])) +code = codes[0] +sel_runs=[run for run in sel_runs if run['code']==code] + +pl_set=[(run['compiler'],run['num-procs'],run['num-procs-node'], + run['mfem-device']) + for run in sel_runs] +pl_set=sorted(set(pl_set)) +print +pprint.pprint(pl_set) + +for plt in pl_set: + compiler=plt[0] + num_procs=plt[1] + num_procs_node=plt[2] + mfem_dev=plt[3] + num_nodes=num_procs/num_procs_node + pl_runs=[run for run in sel_runs + if run['compiler']==compiler and + run['num-procs']==num_procs and + run['num-procs-node']==num_procs_node and + run['mfem-device']==mfem_dev] + pl_runs=sorted(pl_runs) + if len(pl_runs)==0: + continue + + print + print 'compiler: %s, compute nodes: %i, number of MPI tasks = %i'%( + compiler,num_nodes,num_procs) + print 'mfem device: %s'%mfem_dev + + figure() + i=0 + sol_p_set=sorted(set([run['order'] for run in pl_runs])) + for sol_p in sol_p_set: + qpts=sorted(list(set([run['quadrature-pts'] for run in pl_runs + if run['order']==sol_p]))) + qpts.reverse() + print 'Order: %i, quadrature points:'%sol_p, qpts + qpts_1d=[int(q**(1./3)+0.5) for q in qpts] + + d=[[run['order'],run['num-elem'],1.*run['num-unknowns']/num_nodes/vdim, + (run['num-unknowns']/run[my_y_data])/1e6] +# run['cg-iteration-dps']/num_nodes] + for run in pl_runs + if run['order']==sol_p and + run['quadrature-pts']==qpts[0]] + # print + # print 'order = %i'%sol_p + # pprint.pprint(sorted(d)) + d=[[e[2],e[3]] for e in d if e[0]==sol_p] + # (DOFs/[sec/iter]/node)/(DOFs/node) = iter/sec + d=[[nun, + min([e[1] for e in d if e[0]==nun]), + max([e[1] for e in d if e[0]==nun])] + for nun in set([e[0] for e 
in d])] + d=asarray(sorted(d)) + ## Legend with 'q=...' + # plot(d[:,0],d[:,2],'o-',color=colors[i%cm_size], + # label='p=%i, q=p+%i'%(sol_p,qpts_1d[0]-sol_p)) + ## Legend without 'q=...' + print(d) + plot(d[:,0],d[:,2],'o-',color=colors[i%cm_size], + label='p=%i'%sol_p) + if list(d[:,1]) != list(d[:,2]): + plot(d[:,0],d[:,1],'o-',color=colors[i]) + fill_between(d[:,0],d[:,1],d[:,2],facecolor=colors[i],alpha=0.2) + ## + if len(qpts)==1: + i=i+1 + continue + d=[[run['order'],run['num-elem'],1.*run['num-unknowns']/num_nodes/vdim, + run['cg-iteration-dps']/num_nodes] + for run in pl_runs + if run['order']==sol_p and (run['quadrature-pts']==qpts[1])] + d=[[e[2],e[3]] for e in d if e[0]==sol_p] + if len(d)==0: + i=i+1 + continue + d=[[nun, + min([e[1] for e in d if e[0]==nun]), + max([e[1] for e in d if e[0]==nun])] + for nun in set([e[0] for e in d])] + d=asarray(sorted(d)) + plot(d[:,0],d[:,2],'s--',color=colors[i], + label='p=%i, q=p+%i'%(sol_p,qpts_1d[1]-sol_p)) + if list(d[:,1]) != list(d[:,2]): + plot(d[:,0],d[:,1],'s--',color=colors[i]) + ## + i=i+1 + ## + if draw_iter_lines: + y0,y1=ymin_iter_lines,ymax_iter_lines + y=asarray([y0,y1]) if log_y else exp(linspace(log(y0), log(y1))) + slope1=600. + slope2=6000. + plot(y/slope1,y,'k--',label='%g iter/s'%(slope1/vdim)) + plot(y/slope2,y,'k-',label='%g iter/s'%(slope2/vdim)) + + title('Config: %s/%s, host: %s (%i node%s, %i task%s/node), %s, %s'%( + code,mfem_dev,config_short,num_nodes,'' if num_nodes==1 else 's', + num_procs_node,'' if num_procs_node==1 else 's', compiler, + test)) + xscale('log') # subsx=[2,4,6,8] + if log_y: + yscale('log') + if 'x_range' in vars() and len(x_range)==2: + xlim(x_range) + if 'y_range' in vars() and len(y_range)==2: + ylim(y_range) + # rng=arange(1e7,1.02e8,1e7) + # yticks(rng,['%i'%int(v/1e6) for v in rng]) + # ylim(min(rng),max(rng)) + # xlim(0.5,max([run['order'] for run in pl_runs])+0.5) + grid('on', color='gray', ls='dotted') + grid('on', axis='both', which='minor', color='gray', ls='dotted') + gca().set_axisbelow(True) + xlabel('Points per compute node') + my_y_label = 'DOFS /' + my_y_data + ylabel(my_y_label) + legend(ncol=legend_ncol, loc='best') + + if write_figures: # write .pdf file? + pdf_file='plot2_%s_%s_%s_%s_%s_N%03i_pn%i.pdf'%( + code,mfem_dev,test,config_short,compiler, + num_nodes,num_procs_node) + print 'saving figure --> %s'%pdf_file + savefig(pdf_file, format='pdf', bbox_inches='tight') + + if write_figures: # write .png file? + png_file='plot2_%s_%s_%s_%s_%s_N%03i_pn%i.png'%( + code,mfem_dev,test,config_short,compiler, + num_nodes,num_procs_node) + print 'saving figure --> %s'%png_file + savefig(png_file, format='png', bbox_inches='tight') + +if show_figures: # show the figures? + print '\nshowing figures ...' + show() diff --git a/amgx_precon_postprocess-table.py b/amgx_precon_postprocess-table.py new file mode 100644 index 0000000..59dd8b4 --- /dev/null +++ b/amgx_precon_postprocess-table.py @@ -0,0 +1,45 @@ +# Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at +# the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights +# reserved. See files LICENSE and NOTICE for details. +# +# This file is part of CEED, a collection of benchmarks, miniapps, software +# libraries and APIs for efficient high-order finite element and spectral +# element discretizations for exascale applications. For more information and +# source code availability see http://github.com/ceed. 
+# +# The CEED research is supportted by the Exascale Computing Project +# (17-SC-20-SC), a collaborative effort of two U.S. Department of Energy +# organizations (Office of Science and the National Nuclear Security +# Administration) responsible for the planning and preparation of a capable +# exascale ecosystem, including software, applications, hardware, advanced +# system engineering and early testbed platforms, in support of the nation's +# exascale computing imperative. + + +##### Load the data +execfile('amgx_precon_postprocess-base.py') + +def safe_div(x,y): + if y == 0: + return 0 + return x / y + +##### Sample data output + +set1=sorted( + [(run['mfem-device'], + run['order'],run['compiler'],run['num-procs'], + run['num-unknowns'], safe_div(run['num-unknowns'],run['cost-of-precon'])/1e6 , run['iterations'], + safe_div(run['num-unknowns'],run['time-per-cg-step'])/1e6, safe_div(run['num-unknowns'], + run['total-cg-time'])/1e6, safe_div(run['num-unknowns'],run['amg-setup'])/1e6 + ,run['cg-iteration-dps']/1e6) + for run in runs]) + +out.write('''\ + mfem | | comp | |number of | precon dps | number of | cg-1-it dps | cg-a-it dps | amg-set dps |cg-iter dps + device | p | iler | np |unknowns | millions | iterations | millions | millions | millions |millions +-----------+----+--------+-----+---------+-------------+-------------+-------------+--------------+--------------+------------- +''') +line_fmt=' %9s | %2i | %6s | %3i | %6i | %11.6f | %7i | %11.8f | %11.6f | %11.6f |%11.6f\n' +for run in set1: + out.write(line_fmt%run) diff --git a/amgx_solver_postprocess-base.py b/amgx_solver_postprocess-base.py new file mode 100644 index 0000000..e01095b --- /dev/null +++ b/amgx_solver_postprocess-base.py @@ -0,0 +1,143 @@ +# Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at +# the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights +# reserved. See files LICENSE and NOTICE for details. +# +# This file is part of CEED, a collection of benchmarks, miniapps, software +# libraries and APIs for efficient high-order finite element and spectral +# element discretizations for exascale applications. For more information and +# source code availability see http://github.com/ceed. +# +# The CEED research is supportted by the Exascale Computing Project +# (17-SC-20-SC), a collaborative effort of two U.S. Department of Energy +# organizations (Office of Science and the National Nuclear Security +# Administration) responsible for the planning and preparation of a capable +# exascale ecosystem, including software, applications, hardware, advanced +# system engineering and early testbed platforms, in support of the nation's +# exascale computing imperative. + +from sys import stdout as out +import fileinput +import pprint + +##### Read all input files specified on the command line, or stdin and parse +##### the content, storing it as a list of dictionaries - one dictionary for +##### each test run. + +it=fileinput.input() +state=0 +line='' +i=0 +mesh_p=1 +config='unknown' +compiler='unknown' +test='unknown' +num_procs=0 +num_procs_node=0 +lnfmt='%05i' +data={} +runs=[] + +while True: + ## + if state%2==0: + ## + try: + line=it.next() + i=i+1 + except StopIteration: + break + state=state+1 + ## + elif state==1: + ## + state=0 + if 'Reading configuration' in line: + ## + ## This is the beginning of a new file. 
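+            ## The configuration name is read from this line and the per-file
+            ## MPI counters are reset; they are filled in again when the
+            ## 'Running the tests using a total of ...' and '... tasks per
+            ## node' lines are seen.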
+ ## + # out.write('\n'+lnfmt%i+': %s'%line) + config=line.split()[2] + num_procs=0 + num_procs_node=0 + elif 'Setting up compiler' in line: + # out.write(lnfmt%i+': %s'%line) + compiler=line.split()[3] + elif 'Reading test file: ' in line: + # out.write(lnfmt%i+': %s'%line) + test=line.strip().split('Reading test file: ',1)[-1] + elif 'Running the tests using a total of' in line: + # out.write(lnfmt%i+': %s'%line) + num_procs=int(line.split('a total of ',1)[1].split(None,1)[0]) + elif 'tasks per node' in line: + # out.write(lnfmt%i+': %s'%line) + num_procs_node=int(line.split(' tasks per',1)[0].rsplit(None,1)[1]) + elif line == 'Options used:\n': + ## + ## This is the beginning of a new run. + ## + if 'cg-iteration-dps' in data: + runs.append(data) + # print + # pprint.pprint(data) + data={} + data['file']=fileinput.filename() + data['config']=config + data['compiler']=compiler + data['test']=test + data['num-procs']=num_procs + data['num-procs-node']=num_procs_node + data['mesh-order']=mesh_p + data['code']="MFEM" + test_=test.rsplit('/',1)[-1] + data['case']='scalar' if (('bp1' in test_) or ('bp3' in test_) or + ('bp5' in test_)) else 'vector' + # out.write('\n'+lnfmt%i+': %s'%line) + # out.write('*'*len(lnfmt%i)+': --mesh-p %i\n'%mesh_p) + state=2 + elif 'setup' in line: + data['amg-setup'] = float(line.split()[1]) + #Used when using AmgX as a solver + elif 'solve(per iteration)' in line: + data['time-per-cg-step'] = float(line.split()[2]) + elif 'Total iterations' in line: + data['iterations'] = float(line.split()[2]) + elif 'Total CG time' in line: + data['total-cg-time'] = float(line.split()[3]) + elif '"DOFs/sec" in CG' in line: + # out.write(lnfmt%i+': %s'%line) + data['cg-iteration-dps']=1e6*float(line.split(' ')[3]) + elif 'Number of finite element unknowns' in line: + # out.write(lnfmt%i+': %s'%line) + data['num-unknowns']=long(line.rsplit(None,1)[1]) + elif 'Number of qudrature points per element' in line: + # out.write(lnfmt%i+': %s'%line) + data['quadrature-pts']=int(line.split(' = ',1)[1]) + elif 'Total number of elements:' in line: + # out.write(lnfmt%i+': %s'%line) + data['num-elem']=int(line.rsplit(None,1)[1]) + ## + elif state==3: + ## + if line[:5] == ' --': + # out.write(lnfmt%i+': %s'%line) + state=2 + opt=line[5:].strip().split(' ',1) + if opt[0] in ('order'): + data[opt[0]]=int(opt[1]) + elif opt[0] in ('device'): + data['mfem-device']=opt[1] + else: + state=1 + +if 'cg-iteration-dps' in data: + runs.append(data) + # print + # pprint.pprint(data) +for run in runs: + if not 'quadrature-pts' in run: + run['quadrature-pts']=0 +# print +# print +# pprint.pprint(runs) + +print 'Number of test runs read: %i'%len(runs) diff --git a/amgx_solver_postprocess-plot-2.py b/amgx_solver_postprocess-plot-2.py new file mode 100644 index 0000000..3f83c01 --- /dev/null +++ b/amgx_solver_postprocess-plot-2.py @@ -0,0 +1,225 @@ +# Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at +# the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights +# reserved. See files LICENSE and NOTICE for details. +# +# This file is part of CEED, a collection of benchmarks, miniapps, software +# libraries and APIs for efficient high-order finite element and spectral +# element discretizations for exascale applications. For more information and +# source code availability see http://github.com/ceed. +# +# The CEED research is supported by the Exascale Computing Project +# (17-SC-20-SC), a collaborative effort of two U.S. 
Department of Energy +# organizations (Office of Science and the National Nuclear Security +# Administration) responsible for the planning and preparation of a capable +# exascale ecosystem, including software, applications, hardware, advanced +# system engineering and early testbed platforms, in support of the nation's +# exascale computing imperative. + +#my_y_data = 'amg-setup' +#my_y_data = 'time-per-cg-step' +my_y_data = 'total-cg-time' + +def safe_div(x,y): + if y == 0: + return 0 + return x / y + +##### Adjustable plot parameters: +log_y=0 # use log scale on the y-axis? +#x_range=(1e3,6e6) # plot range for the x-axis; comment out for auto +#y_range=(0,2.2e7) # plot range for the y-axis; comment out for auto +#y_range=(0,2.5e6) # plot range for the y-axis; comment out for auto +draw_iter_lines=0 # draw the "iter/s" lines? +ymin_iter_lines=3e5 # minimal y value for the "iter/s" lines +ymax_iter_lines=8e8 # maximal y value for the "iter/s" lines +legend_ncol=(2 if log_y else 1) # number of columns in the legend +write_figures=1 # save the figures to files? +show_figures=0 # display the figures on the screen? + + +##### Load the data +execfile('amgx_solver_postprocess-base.py') + + +##### Sample plot output + +from matplotlib import use +if not show_figures: + use('pdf') +from pylab import * + +rcParams['font.sans-serif'].insert(0,'Noto Sans') +rcParams['font.sans-serif'].insert(1,'Open Sans') +rcParams['figure.figsize']=[10, 8] # default: 8 x 6 + +cm=get_cmap('Set1') # 'Accent', 'Dark2', 'Set1', 'Set2', 'Set3' +if '_segmentdata' in cm.__dict__: + cm_size=len(cm.__dict__['_segmentdata']['red']) +elif 'colors' in cm.__dict__: + cm_size=len(cm.__dict__['colors']) +colors=[cm(1.*i/(cm_size-1)) for i in range(cm_size)] + +# colors=['blue','green','crimson','turquoise','m','gold','cornflowerblue', +# 'darkorange'] + +sel_runs=runs +configs=list(set([run['config'].rsplit('/',1)[-1].rsplit('.sh',1)[0] + for run in sel_runs])) +# print 'Present configurations:', configs +config=configs[0] +print 'Using configuration:', config +config_short=config +sel_runs=[run for run in sel_runs if + run['config'].rsplit('/',1)[-1].rsplit('.sh',1)[0]==config] + +tests=list(set([run['test'].rsplit('/',1)[-1].rsplit('.sh',1)[0] + for run in sel_runs])) +# print 'Present tests:', tests +test=tests[0] +print 'Using test:', test +test_short=test +sel_runs=[run for run in sel_runs if + run['test'].rsplit('/',1)[-1].rsplit('.sh',1)[0]==test] + +vdim=1 +prob=test + +codes = list(set([run['code'] for run in sel_runs])) +code = codes[0] +sel_runs=[run for run in sel_runs if run['code']==code] + +pl_set=[(run['compiler'],run['num-procs'],run['num-procs-node'], + run['mfem-device']) + for run in sel_runs] +pl_set=sorted(set(pl_set)) +print +pprint.pprint(pl_set) + +for plt in pl_set: + compiler=plt[0] + num_procs=plt[1] + num_procs_node=plt[2] + mfem_dev=plt[3] + num_nodes=num_procs/num_procs_node + pl_runs=[run for run in sel_runs + if run['compiler']==compiler and + run['num-procs']==num_procs and + run['num-procs-node']==num_procs_node and + run['mfem-device']==mfem_dev] + pl_runs=sorted(pl_runs) + if len(pl_runs)==0: + continue + + print + print 'compiler: %s, compute nodes: %i, number of MPI tasks = %i'%( + compiler,num_nodes,num_procs) + print 'mfem device: %s'%mfem_dev + + figure() + i=0 + sol_p_set=sorted(set([run['order'] for run in pl_runs])) + for sol_p in sol_p_set: + qpts=sorted(list(set([run['quadrature-pts'] for run in pl_runs + if run['order']==sol_p]))) + qpts.reverse() + print 'Order: %i, quadrature 
points:'%sol_p, qpts + qpts_1d=[int(q**(1./3)+0.5) for q in qpts] + + d=[[run['order'],run['num-elem'],1.*run['num-unknowns']/num_nodes/vdim, + safe_div(run['num-unknowns'],run[my_y_data])/1e6] +# run['cg-iteration-dps']/num_nodes] + for run in pl_runs + if run['order']==sol_p and + run['quadrature-pts']==qpts[0]] + # print + # print 'order = %i'%sol_p + # pprint.pprint(sorted(d)) + d=[[e[2],e[3]] for e in d if e[0]==sol_p] + # (DOFs/[sec/iter]/node)/(DOFs/node) = iter/sec + d=[[nun, + min([e[1] for e in d if e[0]==nun]), + max([e[1] for e in d if e[0]==nun])] + for nun in set([e[0] for e in d])] + d=asarray(sorted(d)) + ## Legend with 'q=...' + # plot(d[:,0],d[:,2],'o-',color=colors[i%cm_size], + # label='p=%i, q=p+%i'%(sol_p,qpts_1d[0]-sol_p)) + ## Legend without 'q=...' + print(d) + plot(d[:,0],d[:,2],'o-',color=colors[i%cm_size], + label='p=%i'%sol_p) + if list(d[:,1]) != list(d[:,2]): + plot(d[:,0],d[:,1],'o-',color=colors[i]) + fill_between(d[:,0],d[:,1],d[:,2],facecolor=colors[i],alpha=0.2) + ## + if len(qpts)==1: + i=i+1 + continue + d=[[run['order'],run['num-elem'],1.*run['num-unknowns']/num_nodes/vdim, + run['cg-iteration-dps']/num_nodes] + for run in pl_runs + if run['order']==sol_p and (run['quadrature-pts']==qpts[1])] + d=[[e[2],e[3]] for e in d if e[0]==sol_p] + if len(d)==0: + i=i+1 + continue + d=[[nun, + min([e[1] for e in d if e[0]==nun]), + max([e[1] for e in d if e[0]==nun])] + for nun in set([e[0] for e in d])] + d=asarray(sorted(d)) + plot(d[:,0],d[:,2],'s--',color=colors[i], + label='p=%i, q=p+%i'%(sol_p,qpts_1d[1]-sol_p)) + if list(d[:,1]) != list(d[:,2]): + plot(d[:,0],d[:,1],'s--',color=colors[i]) + ## + i=i+1 + ## + if draw_iter_lines: + y0,y1=ymin_iter_lines,ymax_iter_lines + y=asarray([y0,y1]) if log_y else exp(linspace(log(y0), log(y1))) + slope1=600. + slope2=6000. + plot(y/slope1,y,'k--',label='%g iter/s'%(slope1/vdim)) + plot(y/slope2,y,'k-',label='%g iter/s'%(slope2/vdim)) + + title('Config: %s/%s, host: %s (%i node%s, %i task%s/node), %s, %s'%( + code,mfem_dev,config_short,num_nodes,'' if num_nodes==1 else 's', + num_procs_node,'' if num_procs_node==1 else 's', compiler, + test)) + xscale('log') # subsx=[2,4,6,8] + if log_y: + yscale('log') + if 'x_range' in vars() and len(x_range)==2: + xlim(x_range) + if 'y_range' in vars() and len(y_range)==2: + ylim(y_range) + # rng=arange(1e7,1.02e8,1e7) + # yticks(rng,['%i'%int(v/1e6) for v in rng]) + # ylim(min(rng),max(rng)) + # xlim(0.5,max([run['order'] for run in pl_runs])+0.5) + grid('on', color='gray', ls='dotted') + grid('on', axis='both', which='minor', color='gray', ls='dotted') + gca().set_axisbelow(True) + xlabel('Points per compute node') + my_y_label = 'DOFS /' + my_y_data + ylabel(my_y_label) + legend(ncol=legend_ncol, loc='best') + + if write_figures: # write .pdf file? + pdf_file='plot2_%s_%s_%s_%s_%s_N%03i_pn%i.pdf'%( + code,mfem_dev,test,config_short,compiler, + num_nodes,num_procs_node) + print 'saving figure --> %s'%pdf_file + savefig(pdf_file, format='pdf', bbox_inches='tight') + + if write_figures: # write .png file? + png_file='plot2_%s_%s_%s_%s_%s_N%03i_pn%i.png'%( + code,mfem_dev,test,config_short,compiler, + num_nodes,num_procs_node) + print 'saving figure --> %s'%png_file + savefig(png_file, format='png', bbox_inches='tight') + +if show_figures: # show the figures? + print '\nshowing figures ...' 
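+    # show() blocks until all open figure windows are closed.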
+ show() diff --git a/amgx_solver_postprocess-table.py b/amgx_solver_postprocess-table.py new file mode 100644 index 0000000..995bd2a --- /dev/null +++ b/amgx_solver_postprocess-table.py @@ -0,0 +1,44 @@ + +# Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at +# the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights +# reserved. See files LICENSE and NOTICE for details. +# +# This file is part of CEED, a collection of benchmarks, miniapps, software +# libraries and APIs for efficient high-order finite element and spectral +# element discretizations for exascale applications. For more information and +# source code availability see http://github.com/ceed. +# +# The CEED research is supportted by the Exascale Computing Project +# (17-SC-20-SC), a collaborative effort of two U.S. Department of Energy +# organizations (Office of Science and the National Nuclear Security +# Administration) responsible for the planning and preparation of a capable +# exascale ecosystem, including software, applications, hardware, advanced +# system engineering and early testbed platforms, in support of the nation's +# exascale computing imperative. + + +##### Load the data +execfile('amgx_solver_postprocess-base.py') + +def safe_div(x,y): + if y == 0: + return 0 + return x / y + +##### Sample data output + +set1=sorted( + [(run['mfem-device'], + run['order'],run['compiler'],run['num-procs'], + run['num-unknowns'], run['iterations'], + safe_div(run['num-unknowns'],run['time-per-cg-step'])/1e6, safe_div(run['num-unknowns'],run['total-cg-time'])/1e6, safe_div(run['num-unknowns'],run['amg-setup'])/1e6, run['cg-iteration-dps']/1e6) + for run in runs]) + +out.write('''\ + mfem | | comp | |number of | number of | cg-1-it dps | cg-a-it dps | amg-set dps |cg-iter dps + device | p | iler | np |unknowns | iterations | millions | millions | millions | millions +-----------+----+--------+-----+----------+-------------+--------------+--------------+--------------+------------- +''') +line_fmt=' %9s | %2i | %6s | %3i | %6i | %11i | %11.6f| %11.6f |%11.6f | %11.6f \n' +for run in set1: + out.write(line_fmt%run) diff --git a/run_all_amgx.sh b/run_all_amgx.sh new file mode 100644 index 0000000..0703554 --- /dev/null +++ b/run_all_amgx.sh @@ -0,0 +1,5 @@ +#Using Block Jacobi Smoother + ./go.sh -c lassen -m xlc -r tests/mfem_amgx/ex1p_amgx_solver.sh num_proc_build=36 >> amgx_solver.txt && + ./go.sh -c lassen -m xlc -r tests/mfem_amgx/ex1p_amgx_precon.sh num_proc_build=36 >> amgx_precon.txt && + ./go.sh -c lassen -m xlc -r tests/mfem_amgx/ex1p_amgx_agg_precon.sh num_proc_build=36 >> amgx_agg_precon.txt && + ./go.sh -c lassen -m xlc -r tests/mfem_amgx/ex1p_amgx_agg_solver.sh num_proc_build=36 >> amgx_agg_solver.txt diff --git a/run_all_amgx_l1.sh b/run_all_amgx_l1.sh new file mode 100644 index 0000000..0786ec2 --- /dev/null +++ b/run_all_amgx_l1.sh @@ -0,0 +1,5 @@ +#Using L1 Jacobi Smoother +./go.sh -c lassen -m xlc -r tests/mfem_amgx/ex1p_amgx_solver_l1_jacobi.sh num_proc_build=36 >> amgx_solver_l1_jacobi.txt && +./go.sh -c lassen -m xlc -r tests/mfem_amgx/ex1p_amgx_precon_l1_jacobi.sh num_proc_build=36 >> amgx_precon_l1_jacobi.txt && +./go.sh -c lassen -m xlc -r tests/mfem_amgx/ex1p_amgx_agg_precon_l1_jacobi.sh num_proc_build=36 >> amgx_agg_precon_l1_jacobi.txt && +./go.sh -c lassen -m xlc -r tests/mfem_amgx/ex1p_amgx_agg_solver_l1_jacobi.sh num_proc_build=36 >> amgx_agg_solver_l1_jacobi.txt diff --git a/tests/mfem_amgx/ex1p_amgx_agg_precon.cpp b/tests/mfem_amgx/ex1p_amgx_agg_precon.cpp new 
file mode 100644 index 0000000..2b510b1 --- /dev/null +++ b/tests/mfem_amgx/ex1p_amgx_agg_precon.cpp @@ -0,0 +1,323 @@ +// MFEM Example 1 - Parallel Version +// +// Compile with: make ex1p +// +// Description: This example code demonstrates the use of MFEM to define a +// simple finite element discretization of the Laplace problem +// -Delta u = 1 with homogeneous Dirichlet boundary conditions. +// Specifically, we discretize using a FE space of the specified +// order, or if order < 1 using an isoparametric/isogeometric +// space (i.e. quadratic for quadratic curvilinear mesh, NURBS for +// NURBS mesh, etc.) + +#include "mfem.hpp" +#include +#include + +using namespace std; +using namespace mfem; + +Mesh *make_mesh(int myid, int num_procs, int dim, int level, + int &par_ref_levels, Array &nxyz); + +int main(int argc, char *argv[]) +{ + // 1. Initialize MPI. + int num_procs, myid; + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + + // 2. Parse command-line options. + int dim = 3; + int level = 0; + int order = 1; + const char *device_config = "cuda"; + + OptionsParser args(argc, argv); + args.AddOption(&dim, "-dim", "--mesh-dimension", + "Solve 2D or 3D problem."); + args.AddOption(&level, "-l", "--refinement-level", + "Set the problem size: 2^level mesh elements per processor."); + args.AddOption(&order, "-o", "--order", + "Finite element order (polynomial degree)."); + args.AddOption(&device_config, "-d", "--device", + "Device configuration string, see Device::Configure()."); + args.Parse(); + if (!args.Good()) + { + if (myid == 0) + { + args.PrintUsage(cout); + } + MPI_Finalize(); + return 1; + } + if (myid == 0) + { + args.PrintOptions(cout); + } + + // 3. Enable hardware devices such as GPUs, and programming models such as + // CUDA, OCCA, RAJA and OpenMP based on command line options. + Device device(device_config); + if (myid == 0) { device.Print(); } + + // 4. Read the (serial) mesh from the given mesh file on all processors. We + // can handle triangular, quadrilateral, tetrahedral, hexahedral, surface + // and volume meshes with the same code. + int par_ref_levels; + Array nxyz; + Mesh *mesh = make_mesh(myid, num_procs, dim, level, par_ref_levels, nxyz); + + // 5. Refine the serial mesh on all processors to increase the resolution. In + // this example we do 'ref_levels' of uniform refinement. We choose + // 'ref_levels' to be the largest number that gives a final mesh with no + // more than 10,000 elements. + { + // Disabled + } + + // 6. Define a parallel mesh by a partitioning of the serial mesh. Refine + // this mesh further in parallel to increase the resolution. Once the + // parallel mesh is defined, the serial mesh can be deleted. + int *partitioning = nxyz.Size() ? mesh->CartesianPartitioning(nxyz) : NULL; + ParMesh *pmesh = new ParMesh(MPI_COMM_WORLD, *mesh, partitioning); + delete [] partitioning; + delete mesh; + for (int l = 0; l < par_ref_levels; l++) + { + pmesh->UniformRefinement(); + } + // pmesh->PrintInfo(); + long global_ne = pmesh->ReduceInt(pmesh->GetNE()); + if (myid == 0) + { + cout << "Total number of elements: " << global_ne << endl; + } + + // 7. Define a parallel finite element space on the parallel mesh. Here we + // use continuous Lagrange finite elements of the specified order. If + // order < 1, we instead use an isoparametric/isogeometric space. 
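+   //    (This benchmark variant only accepts order >= 1; the isoparametric
+   //    fallback mentioned above is not implemented here, see the
+   //    MFEM_VERIFY below.)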
+ FiniteElementCollection *fec; + MFEM_VERIFY(order > 0, "invalid 'order': " << order); + fec = new H1_FECollection(order, dim); + ParFiniteElementSpace *fespace = new ParFiniteElementSpace(pmesh, fec); + HYPRE_Int size = fespace->GlobalTrueVSize(); + if (myid == 0) + { + cout << "Number of finite element unknowns: " << size << endl; + } + + // 8. Determine the list of true (i.e. parallel conforming) essential + // boundary dofs. In this example, the boundary conditions are defined + // by marking all the boundary attributes from the mesh as essential + // (Dirichlet) and converting them to a list of true dofs. + Array ess_tdof_list; + if (pmesh->bdr_attributes.Size()) + { + Array ess_bdr(pmesh->bdr_attributes.Max()); + ess_bdr = 1; + fespace->GetEssentialTrueDofs(ess_bdr, ess_tdof_list); + } + + // 9. Set up the parallel linear form b(.) which corresponds to the + // right-hand side of the FEM linear system, which in this case is + // (1,phi_i) where phi_i are the basis functions in fespace. + ParLinearForm *b = new ParLinearForm(fespace); + ConstantCoefficient one(1.0); + b->AddDomainIntegrator(new DomainLFIntegrator(one)); + b->Assemble(); + + // 10. Define the solution vector x as a parallel finite element grid function + // corresponding to fespace. Initialize x with initial guess of zero, + // which satisfies the boundary conditions. + ParGridFunction x(fespace); + x = 0.0; + + // 11. Set up the parallel bilinear form a(.,.) on the finite element space + // corresponding to the Laplacian operator -Delta, by adding the Diffusion + // domain integrator. + ParBilinearForm *a = new ParBilinearForm(fespace); + a->AddDomainIntegrator(new DiffusionIntegrator(one)); + + // 12. Assemble the parallel bilinear form and the corresponding linear + // system, applying any necessary transformations such as: parallel + // assembly, eliminating boundary conditions, applying conforming + // constraints for non-conforming AMR, static condensation, etc. + a->Assemble(); + + OperatorPtr A; + Vector B, X; + a->FormLinearSystem(ess_tdof_list, x, *b, A, X, B); + + // 13. Solve the linear system A X = B. + // * With full assembly, use the preconditioner from AmgX. 
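+   //     The JSON string assembled below configures AmgX as a single
+   //     aggregation-AMG V-cycle (selector SIZE_2, interpolator D2, one
+   //     pre/post sweep of damped block-Jacobi with relaxation factor 0.7,
+   //     "max_iters": 1), which is then attached as a preconditioner to
+   //     MFEM's CGSolver (rel. tol. 1e-12, at most 200 iterations).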
+ + bool amgx_verbose = true; + + string amgx_config = "{\n" + " \"config_version\": 2, \n" + " \"solver\": { \n" + " \"algorithm\": \"AGGREGATION\", \n" + " \"solver\": \"AMG\", \n" + " \"smoother\": { \n" + " \"scope\": \"jacobi\", \n" + " \"solver\": \"BLOCK_JACOBI\", \n" + " \"relaxation_factor\": 0.7 \n" + " }, \n" + " \"presweeps\": 1, \n" + " \"interpolator\": \"D2\", \n" + " \"selector\": \"SIZE_2\", \n" + " \"max_row_sum\" : 0.9, \n" + " \"strength_threshold\" : 0.25, \n" + " \"postsweeps\": 1, \n" + " \"max_iters\": 1, \n" + " \"cycle\": \"V\""; + if (amgx_verbose) + { + amgx_config = amgx_config + ",\n" + " \"obtain_timings\": 1 \n"; + } + else + { + amgx_config = amgx_config + "\n"; + } + amgx_config = amgx_config + " }\n" + "}\n"; + + AmgXSolver *prec = new AmgXSolver; + prec->SetConvergenceCheck(false); + prec->ReadParameters(amgx_config, AmgXSolver::INTERNAL); + prec->InitExclusiveGPU(MPI_COMM_WORLD); + + CGSolver cg(MPI_COMM_WORLD); + const int max_cg_iter = 200; + const int cg_print_level = 3; + cg.SetRelTol(1e-12); + cg.SetMaxIter(max_cg_iter); + cg.SetPrintLevel(cg_print_level); + if (prec) { cg.SetPreconditioner(*prec); } + cg.SetOperator(*A); + + // Warm-up CG solve (in case of JIT to avoid timing it) + { + Vector Xtmp(X); + cg.SetMaxIter(2); + cg.SetPrintLevel(-1); + cg.Mult(B, Xtmp); + cg.SetMaxIter(max_cg_iter); + cg.SetPrintLevel(cg_print_level); + } + + // Sync all ranks + MPI_Barrier(pmesh->GetComm()); + + // Start CG timing. + tic_toc.Clear(); + + // Start & Stop CG timing. + tic_toc.Start(); + cg.Mult(B, X); + tic_toc.Stop(); + double rt_min, rt_max, my_rt; + my_rt = tic_toc.RealTime(); + MPI_Reduce(&my_rt, &rt_min, 1, MPI_DOUBLE, MPI_MIN, 0, pmesh->GetComm()); + MPI_Reduce(&my_rt, &rt_max, 1, MPI_DOUBLE, MPI_MAX, 0, pmesh->GetComm()); + + // Print timing results. + if (myid == 0) + { + int cg_iter = cg.GetNumIterations(); + // Note: In the pcg algorithm, the number of operator Mult() calls is + // N_iter and the number of preconditioner Mult() calls is N_iter+1. + cout << '\n' + << "Total iterations "<< cg_iter; + cout << '\n' + << "Total CG time: " << rt_max << " (" << rt_min << ") sec." + << endl; + cout << "Time per CG step: " + << rt_max / cg_iter << " (" + << rt_min / cg_iter << ") sec." << endl; + cout << "\n\"DOFs/sec\" in CG: " + << 1e-6*size*cg_iter/rt_max << " (" + << 1e-6*size*cg_iter/rt_min << ") million.\n" + << endl; + } + delete prec; + + // 14. Recover the parallel grid function corresponding to X. This is the + // local finite element solution on each processor. + a->RecoverFEMSolution(X, *b, x); + + // 15. Save the refined mesh and the solution in parallel. This output can + // be viewed later using GLVis: "glvis -np -m mesh -g sol". + { + //disabled + } + + // 16. Send the solution by socket to a GLVis server. + //if (visualization) + { + //disabled + } + + // 17. Free the used memory. + delete a; + delete b; + delete fespace; + if (order > 0) { delete fec; } + delete pmesh; + + MPI_Finalize(); + + return 0; +} + + +Mesh *make_mesh(int myid, int num_procs, int dim, int level, + int &par_ref_levels, Array &nxyz) +{ + int log_p = (int)floor(log((double)num_procs)/log(2.0) + 0.5); + MFEM_VERIFY((1 << log_p) == num_procs, + "number of processor is not a power of 2: " << num_procs); + MFEM_VERIFY(dim == 3, "dim = " << dim << " is NOT implemented!"); + + // Determine processor decomposition. + int s[3]; + s[0] = log_p/3 + (log_p%3 > 0 ? 1 : 0); + s[1] = log_p/3 + (log_p%3 > 1 ? 
1 : 0); + s[2] = log_p/3; + nxyz.SetSize(dim); + nxyz[0] = 1 << s[0]; + nxyz[1] = 1 << s[1]; + nxyz[2] = 1 << s[2]; + + // Determine mesh size. + int ser_level = level%3; + par_ref_levels = level/3; + int log_n = log_p + ser_level; + int t[3]; + t[0] = log_n/3 + (log_n%3 > 0 ? 1 : 0); + t[1] = log_n/3 + (log_n%3 > 1 ? 1 : 0); + t[2] = log_n/3; + + // Create the Mesh. + const bool gen_edges = true; + const bool sfc_ordering = true; + Mesh *mesh = new Mesh(1 << t[0], 1 << t[1], 1 << t[2], + Element::HEXAHEDRON, gen_edges, + 1.0, 1.0, 1.0, sfc_ordering); + if (myid == 0) + { + cout << "Processor partitioning: "; + nxyz.Print(cout, dim); + + // Mesh dimensions AFTER parallel refinement: + cout << "Mesh dimensions: " + << (1 << (t[0]+par_ref_levels)) << ' ' + << (1 << (t[1]+par_ref_levels)) << ' ' + << (1 << (t[2]+par_ref_levels)) << endl; + } + + return mesh; +} diff --git a/tests/mfem_amgx/ex1p_amgx_agg_precon.sh b/tests/mfem_amgx/ex1p_amgx_agg_precon.sh new file mode 100644 index 0000000..595d577 --- /dev/null +++ b/tests/mfem_amgx/ex1p_amgx_agg_precon.sh @@ -0,0 +1,85 @@ +# Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at +# the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights +# reserved. See files LICENSE and NOTICE for details. +# +# This file is part of CEED, a collection of benchmarks, miniapps, software +# libraries and APIs for efficient high-order finite element and spectral +# element discretizations for exascale applications. For more information and +# source code availability see http://github.com/ceed. +# +# The CEED research is supported by the Exascale Computing Project +# (17-SC-20-SC), a collaborative effort of two U.S. Department of Energy +# organizations (Office of Science and the National Nuclear Security +# Administration) responsible for the planning and preparation of a capable +# exascale ecosystem, including software, applications, hardware, advanced +# system engineering and early testbed platforms, in support of the nation's +# exascale computing imperative. 
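+
+# This test builds the ex1p_amgx_agg_precon driver and sweeps it over
+# polynomial orders p = min_p..max_p (default 1..4) and refinement levels l
+# for as long as roughly p^3 * 2^l DOFs per MPI task stays below
+# max_dofs_proc (default 4200000), for every device in 'mfem_devs'
+# (default: cuda). Hypothetical example overriding the sweep bounds, passed
+# the same way as num_proc_build in run_all_amgx.sh:
+#   ./go.sh -c lassen -m xlc -r tests/mfem_amgx/ex1p_amgx_agg_precon.sh \
+#       num_proc_build=36 min_p=2 max_p=3 max_dofs_proc=1000000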
+ +problem=${problem:-0} + +function build_tests() +{ + local make_opts=( + "-j" "$num_proc_build" + "MFEM_DIR=$MFEM_DIR" + "BLD=$test_exe_dir/") + quoted_echo make "${make_opts[@]}" + [[ -n "$dry_run" ]] || make "${make_opts[@]}" +} + + +function run_tests() +{ + local test_name="ex1p_amgx_agg_precon" + set_mpi_options + # 'min_p' can be set on the command line + local l_min_p=${min_p:-1} + # 'max_p' can be set on the command line + local l_max_p=${max_p:-4} + # 'max_dofs_proc' can be set on the command line + local l_max_dofs_proc=${max_dofs_proc:-4200000} + # 'mfem_devs' can be set on the command line + local l_mfem_devs=(${mfem_devs:-cuda}) + local dim=3 + local args= + for dev in ${l_mfem_devs[@]}; do + for ((p = l_min_p; p <= l_max_p; p++)) do + for ((l = 0; (p**dim)*(2**l) <= l_max_dofs_proc; l++)) do + args=(-o $p -l $l -d $dev) + if [ -z "$dry_run" ]; then + echo "Running test:" + quoted_echo $mpi_run ./$test_name "${args[@]}" + $mpi_run ./$test_name "${args[@]}" || \ + printf "\nError in the test, error code: $?\n\n" + else + $dry_run $mpi_run ./$test_name "${args[@]}" + fi + done + done + done +} + + +function build_and_run_tests() +{ + $dry_run cd "$test_dir" + + build_tests || return 1 + echo + + [[ -n "$build_only" ]] && return + + $dry_run cd "$test_exe_dir" + run_tests +} + + +mfem_branch=${mfem_branch:-master} +libceed_branch=${libceed_branch:-master} + +# Uncomment the next line to enable 64-bit HYPRE_Int: +# hypre_big_int=1 + +packages=${packages:-cuda metis amgx hypre mfem} + +test_required_packages=${packages} diff --git a/tests/mfem_amgx/ex1p_amgx_agg_precon_l1_jacobi.cpp b/tests/mfem_amgx/ex1p_amgx_agg_precon_l1_jacobi.cpp new file mode 100644 index 0000000..9fb62f3 --- /dev/null +++ b/tests/mfem_amgx/ex1p_amgx_agg_precon_l1_jacobi.cpp @@ -0,0 +1,322 @@ +// MFEM Example 1 - Parallel Version +// +// Compile with: make ex1p +// +// Description: This example code demonstrates the use of MFEM to define a +// simple finite element discretization of the Laplace problem +// -Delta u = 1 with homogeneous Dirichlet boundary conditions. +// Specifically, we discretize using a FE space of the specified +// order, or if order < 1 using an isoparametric/isogeometric +// space (i.e. quadratic for quadratic curvilinear mesh, NURBS for +// NURBS mesh, etc.) + +#include "mfem.hpp" +#include +#include + +using namespace std; +using namespace mfem; + +Mesh *make_mesh(int myid, int num_procs, int dim, int level, + int &par_ref_levels, Array &nxyz); + +int main(int argc, char *argv[]) +{ + // 1. Initialize MPI. + int num_procs, myid; + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + + // 2. Parse command-line options. + int dim = 3; + int level = 0; + int order = 1; + const char *device_config = "cuda"; + + OptionsParser args(argc, argv); + args.AddOption(&dim, "-dim", "--mesh-dimension", + "Solve 2D or 3D problem."); + args.AddOption(&level, "-l", "--refinement-level", + "Set the problem size: 2^level mesh elements per processor."); + args.AddOption(&order, "-o", "--order", + "Finite element order (polynomial degree)."); + args.AddOption(&device_config, "-d", "--device", + "Device configuration string, see Device::Configure()."); + args.Parse(); + if (!args.Good()) + { + if (myid == 0) + { + args.PrintUsage(cout); + } + MPI_Finalize(); + return 1; + } + if (myid == 0) + { + args.PrintOptions(cout); + } + + // 3. 
Enable hardware devices such as GPUs, and programming models such as + // CUDA, OCCA, RAJA and OpenMP based on command line options. + Device device(device_config); + if (myid == 0) { device.Print(); } + + // 4. Read the (serial) mesh from the given mesh file on all processors. We + // can handle triangular, quadrilateral, tetrahedral, hexahedral, surface + // and volume meshes with the same code. + int par_ref_levels; + Array nxyz; + Mesh *mesh = make_mesh(myid, num_procs, dim, level, par_ref_levels, nxyz); + + // 5. Refine the serial mesh on all processors to increase the resolution. In + // this example we do 'ref_levels' of uniform refinement. We choose + // 'ref_levels' to be the largest number that gives a final mesh with no + // more than 10,000 elements. + { + // Disabled + } + + // 6. Define a parallel mesh by a partitioning of the serial mesh. Refine + // this mesh further in parallel to increase the resolution. Once the + // parallel mesh is defined, the serial mesh can be deleted. + int *partitioning = nxyz.Size() ? mesh->CartesianPartitioning(nxyz) : NULL; + ParMesh *pmesh = new ParMesh(MPI_COMM_WORLD, *mesh, partitioning); + delete [] partitioning; + delete mesh; + for (int l = 0; l < par_ref_levels; l++) + { + pmesh->UniformRefinement(); + } + // pmesh->PrintInfo(); + long global_ne = pmesh->ReduceInt(pmesh->GetNE()); + if (myid == 0) + { + cout << "Total number of elements: " << global_ne << endl; + } + + // 7. Define a parallel finite element space on the parallel mesh. Here we + // use continuous Lagrange finite elements of the specified order. If + // order < 1, we instead use an isoparametric/isogeometric space. + FiniteElementCollection *fec; + MFEM_VERIFY(order > 0, "invalid 'order': " << order); + fec = new H1_FECollection(order, dim); + ParFiniteElementSpace *fespace = new ParFiniteElementSpace(pmesh, fec); + HYPRE_Int size = fespace->GlobalTrueVSize(); + if (myid == 0) + { + cout << "Number of finite element unknowns: " << size << endl; + } + + // 8. Determine the list of true (i.e. parallel conforming) essential + // boundary dofs. In this example, the boundary conditions are defined + // by marking all the boundary attributes from the mesh as essential + // (Dirichlet) and converting them to a list of true dofs. + Array ess_tdof_list; + if (pmesh->bdr_attributes.Size()) + { + Array ess_bdr(pmesh->bdr_attributes.Max()); + ess_bdr = 1; + fespace->GetEssentialTrueDofs(ess_bdr, ess_tdof_list); + } + + // 9. Set up the parallel linear form b(.) which corresponds to the + // right-hand side of the FEM linear system, which in this case is + // (1,phi_i) where phi_i are the basis functions in fespace. + ParLinearForm *b = new ParLinearForm(fespace); + ConstantCoefficient one(1.0); + b->AddDomainIntegrator(new DomainLFIntegrator(one)); + b->Assemble(); + + // 10. Define the solution vector x as a parallel finite element grid function + // corresponding to fespace. Initialize x with initial guess of zero, + // which satisfies the boundary conditions. + ParGridFunction x(fespace); + x = 0.0; + + // 11. Set up the parallel bilinear form a(.,.) on the finite element space + // corresponding to the Laplacian operator -Delta, by adding the Diffusion + // domain integrator. + ParBilinearForm *a = new ParBilinearForm(fespace); + a->AddDomainIntegrator(new DiffusionIntegrator(one)); + + // 12. 
Assemble the parallel bilinear form and the corresponding linear + // system, applying any necessary transformations such as: parallel + // assembly, eliminating boundary conditions, applying conforming + // constraints for non-conforming AMR, static condensation, etc. + a->Assemble(); + + OperatorPtr A; + Vector B, X; + a->FormLinearSystem(ess_tdof_list, x, *b, A, X, B); + + // 13. Solve the linear system A X = B. + // * With full assembly, use the preconditioner from AmgX. + + bool amgx_verbose = true; + + string amgx_config = "{\n" + " \"config_version\": 2, \n" + " \"solver\": { \n" + " \"algorithm\": \"AGGREGATION\", \n" + " \"solver\": \"AMG\", \n" + " \"smoother\": { \n" + " \"scope\": \"jacobi\", \n" + " \"solver\": \"JACOBI_L1\" \n" + " }, \n" + " \"presweeps\": 1, \n" + " \"interpolator\": \"D2\", \n" + " \"selector\": \"SIZE_2\", \n" + " \"max_row_sum\" : 0.9, \n" + " \"strength_threshold\" : 0.25, \n" + " \"postsweeps\": 1, \n" + " \"max_iters\": 1, \n" + " \"cycle\": \"V\""; + if (amgx_verbose) + { + amgx_config = amgx_config + ",\n" + " \"obtain_timings\": 1 \n"; + } + else + { + amgx_config = amgx_config + "\n"; + } + amgx_config = amgx_config + " }\n" + "}\n"; + + AmgXSolver *prec = new AmgXSolver; + prec->SetConvergenceCheck(false); + prec->ReadParameters(amgx_config, AmgXSolver::INTERNAL); + prec->InitExclusiveGPU(MPI_COMM_WORLD); + + CGSolver cg(MPI_COMM_WORLD); + const int max_cg_iter = 200; + const int cg_print_level = 3; + cg.SetRelTol(1e-12); + cg.SetMaxIter(max_cg_iter); + cg.SetPrintLevel(cg_print_level); + if (prec) { cg.SetPreconditioner(*prec); } + cg.SetOperator(*A); + + // Warm-up CG solve (in case of JIT to avoid timing it) + { + Vector Xtmp(X); + cg.SetMaxIter(2); + cg.SetPrintLevel(-1); + cg.Mult(B, Xtmp); + cg.SetMaxIter(max_cg_iter); + cg.SetPrintLevel(cg_print_level); + } + + // Sync all ranks + MPI_Barrier(pmesh->GetComm()); + + // Start CG timing. + tic_toc.Clear(); + + // Start & Stop CG timing. + tic_toc.Start(); + cg.Mult(B, X); + tic_toc.Stop(); + double rt_min, rt_max, my_rt; + my_rt = tic_toc.RealTime(); + MPI_Reduce(&my_rt, &rt_min, 1, MPI_DOUBLE, MPI_MIN, 0, pmesh->GetComm()); + MPI_Reduce(&my_rt, &rt_max, 1, MPI_DOUBLE, MPI_MAX, 0, pmesh->GetComm()); + + // Print timing results. + if (myid == 0) + { + int cg_iter = cg.GetNumIterations(); + // Note: In the pcg algorithm, the number of operator Mult() calls is + // N_iter and the number of preconditioner Mult() calls is N_iter+1. + cout << '\n' + << "Total iterations "<< cg_iter; + cout << '\n' + << "Total CG time: " << rt_max << " (" << rt_min << ") sec." + << endl; + cout << "Time per CG step: " + << rt_max / cg_iter << " (" + << rt_min / cg_iter << ") sec." << endl; + cout << "\n\"DOFs/sec\" in CG: " + << 1e-6*size*cg_iter/rt_max << " (" + << 1e-6*size*cg_iter/rt_min << ") million.\n" + << endl; + } + delete prec; + + // 14. Recover the parallel grid function corresponding to X. This is the + // local finite element solution on each processor. + a->RecoverFEMSolution(X, *b, x); + + // 15. Save the refined mesh and the solution in parallel. This output can + // be viewed later using GLVis: "glvis -np -m mesh -g sol". + { + //disabled + } + + // 16. Send the solution by socket to a GLVis server. + //if (visualization) + { + //disabled + } + + // 17. Free the used memory. 
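+   //     (The AmgX preconditioner 'prec' was already deleted right after the
+   //     timed CG solve above.)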
+ delete a; + delete b; + delete fespace; + if (order > 0) { delete fec; } + delete pmesh; + + MPI_Finalize(); + + return 0; +} + + +Mesh *make_mesh(int myid, int num_procs, int dim, int level, + int &par_ref_levels, Array &nxyz) +{ + int log_p = (int)floor(log((double)num_procs)/log(2.0) + 0.5); + MFEM_VERIFY((1 << log_p) == num_procs, + "number of processor is not a power of 2: " << num_procs); + MFEM_VERIFY(dim == 3, "dim = " << dim << " is NOT implemented!"); + + // Determine processor decomposition. + int s[3]; + s[0] = log_p/3 + (log_p%3 > 0 ? 1 : 0); + s[1] = log_p/3 + (log_p%3 > 1 ? 1 : 0); + s[2] = log_p/3; + nxyz.SetSize(dim); + nxyz[0] = 1 << s[0]; + nxyz[1] = 1 << s[1]; + nxyz[2] = 1 << s[2]; + + // Determine mesh size. + int ser_level = level%3; + par_ref_levels = level/3; + int log_n = log_p + ser_level; + int t[3]; + t[0] = log_n/3 + (log_n%3 > 0 ? 1 : 0); + t[1] = log_n/3 + (log_n%3 > 1 ? 1 : 0); + t[2] = log_n/3; + + // Create the Mesh. + const bool gen_edges = true; + const bool sfc_ordering = true; + Mesh *mesh = new Mesh(1 << t[0], 1 << t[1], 1 << t[2], + Element::HEXAHEDRON, gen_edges, + 1.0, 1.0, 1.0, sfc_ordering); + if (myid == 0) + { + cout << "Processor partitioning: "; + nxyz.Print(cout, dim); + + // Mesh dimensions AFTER parallel refinement: + cout << "Mesh dimensions: " + << (1 << (t[0]+par_ref_levels)) << ' ' + << (1 << (t[1]+par_ref_levels)) << ' ' + << (1 << (t[2]+par_ref_levels)) << endl; + } + + return mesh; +} diff --git a/tests/mfem_amgx/ex1p_amgx_agg_precon_l1_jacobi.sh b/tests/mfem_amgx/ex1p_amgx_agg_precon_l1_jacobi.sh new file mode 100644 index 0000000..e331554 --- /dev/null +++ b/tests/mfem_amgx/ex1p_amgx_agg_precon_l1_jacobi.sh @@ -0,0 +1,85 @@ +# Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at +# the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights +# reserved. See files LICENSE and NOTICE for details. +# +# This file is part of CEED, a collection of benchmarks, miniapps, software +# libraries and APIs for efficient high-order finite element and spectral +# element discretizations for exascale applications. For more information and +# source code availability see http://github.com/ceed. +# +# The CEED research is supported by the Exascale Computing Project +# (17-SC-20-SC), a collaborative effort of two U.S. Department of Energy +# organizations (Office of Science and the National Nuclear Security +# Administration) responsible for the planning and preparation of a capable +# exascale ecosystem, including software, applications, hardware, advanced +# system engineering and early testbed platforms, in support of the nation's +# exascale computing imperative. 
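+
+# Same parameter sweep as ex1p_amgx_agg_precon.sh; the driver built here uses
+# the JACOBI_L1 smoother in its AmgX configuration instead of the damped
+# BLOCK_JACOBI smoother.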
+ +problem=${problem:-0} + +function build_tests() +{ + local make_opts=( + "-j" "$num_proc_build" + "MFEM_DIR=$MFEM_DIR" + "BLD=$test_exe_dir/") + quoted_echo make "${make_opts[@]}" + [[ -n "$dry_run" ]] || make "${make_opts[@]}" +} + + +function run_tests() +{ + local test_name="ex1p_amgx_agg_precon_l1_jacobi" + set_mpi_options + # 'min_p' can be set on the command line + local l_min_p=${min_p:-1} + # 'max_p' can be set on the command line + local l_max_p=${max_p:-4} + # 'max_dofs_proc' can be set on the command line + local l_max_dofs_proc=${max_dofs_proc:-4200000} + # 'mfem_devs' can be set on the command line + local l_mfem_devs=(${mfem_devs:-cuda}) + local dim=3 + local args= + for dev in ${l_mfem_devs[@]}; do + for ((p = l_min_p; p <= l_max_p; p++)) do + for ((l = 0; (p**dim)*(2**l) <= l_max_dofs_proc; l++)) do + args=(-o $p -l $l -d $dev) + if [ -z "$dry_run" ]; then + echo "Running test:" + quoted_echo $mpi_run ./$test_name "${args[@]}" + $mpi_run ./$test_name "${args[@]}" || \ + printf "\nError in the test, error code: $?\n\n" + else + $dry_run $mpi_run ./$test_name "${args[@]}" + fi + done + done + done +} + + +function build_and_run_tests() +{ + $dry_run cd "$test_dir" + + build_tests || return 1 + echo + + [[ -n "$build_only" ]] && return + + $dry_run cd "$test_exe_dir" + run_tests +} + + +mfem_branch=${mfem_branch:-master} +libceed_branch=${libceed_branch:-master} + +# Uncomment the next line to enable 64-bit HYPRE_Int: +# hypre_big_int=1 + +packages=${packages:-cuda metis amgx hypre mfem} + +test_required_packages=${packages} diff --git a/tests/mfem_amgx/ex1p_amgx_agg_solver.cpp b/tests/mfem_amgx/ex1p_amgx_agg_solver.cpp new file mode 100644 index 0000000..8b10142 --- /dev/null +++ b/tests/mfem_amgx/ex1p_amgx_agg_solver.cpp @@ -0,0 +1,323 @@ +// MFEM Example 1 - Parallel Version +// +// Compile with: make ex1p +// +// Description: This example code demonstrates the use of MFEM to define a +// simple finite element discretization of the Laplace problem +// -Delta u = 1 with homogeneous Dirichlet boundary conditions. +// Specifically, we discretize using a FE space of the specified +// order, or if order < 1 using an isoparametric/isogeometric +// space (i.e. quadratic for quadratic curvilinear mesh, NURBS for +// NURBS mesh, etc.) + +#include "mfem.hpp" +#include +#include + +using namespace std; +using namespace mfem; + +Mesh *make_mesh(int myid, int num_procs, int dim, int level, + int &par_ref_levels, Array &nxyz); + +int main(int argc, char *argv[]) +{ + // 1. Initialize MPI. + int num_procs, myid; + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + + // 2. Parse command-line options. + int dim = 3; + int level = 0; + int order = 1; + const char *device_config = "cuda"; + + OptionsParser args(argc, argv); + args.AddOption(&dim, "-dim", "--mesh-dimension", + "Solve 2D or 3D problem."); + args.AddOption(&level, "-l", "--refinement-level", + "Set the problem size: 2^level mesh elements per processor."); + args.AddOption(&order, "-o", "--order", + "Finite element order (polynomial degree)."); + args.AddOption(&device_config, "-d", "--device", + "Device configuration string, see Device::Configure()."); + args.Parse(); + if (!args.Good()) + { + if (myid == 0) + { + args.PrintUsage(cout); + } + MPI_Finalize(); + return 1; + } + if (myid == 0) + { + args.PrintOptions(cout); + } + + // 3. Enable hardware devices such as GPUs, and programming models such as + // CUDA, OCCA, RAJA and OpenMP based on command line options. 
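+   //    (The '-d' default in this benchmark is "cuda", so the driver runs on
+   //    the GPU unless the test script overrides 'mfem_devs'.)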
+ Device device(device_config); + if (myid == 0) { device.Print(); } + + // 4. Read the (serial) mesh from the given mesh file on all processors. We + // can handle triangular, quadrilateral, tetrahedral, hexahedral, surface + // and volume meshes with the same code. + int par_ref_levels; + Array nxyz; + Mesh *mesh = make_mesh(myid, num_procs, dim, level, par_ref_levels, nxyz); + + // 5. Refine the serial mesh on all processors to increase the resolution. In + // this example we do 'ref_levels' of uniform refinement. We choose + // 'ref_levels' to be the largest number that gives a final mesh with no + // more than 10,000 elements. + { + // Disabled + } + + // 6. Define a parallel mesh by a partitioning of the serial mesh. Refine + // this mesh further in parallel to increase the resolution. Once the + // parallel mesh is defined, the serial mesh can be deleted. + int *partitioning = nxyz.Size() ? mesh->CartesianPartitioning(nxyz) : NULL; + ParMesh *pmesh = new ParMesh(MPI_COMM_WORLD, *mesh, partitioning); + delete [] partitioning; + delete mesh; + for (int l = 0; l < par_ref_levels; l++) + { + pmesh->UniformRefinement(); + } + // pmesh->PrintInfo(); + long global_ne = pmesh->ReduceInt(pmesh->GetNE()); + if (myid == 0) + { + cout << "Total number of elements: " << global_ne << endl; + } + + // 7. Define a parallel finite element space on the parallel mesh. Here we + // use continuous Lagrange finite elements of the specified order. If + // order < 1, we instead use an isoparametric/isogeometric space. + FiniteElementCollection *fec; + MFEM_VERIFY(order > 0, "invalid 'order': " << order); + fec = new H1_FECollection(order, dim); + ParFiniteElementSpace *fespace = new ParFiniteElementSpace(pmesh, fec); + HYPRE_Int size = fespace->GlobalTrueVSize(); + if (myid == 0) + { + cout << "Number of finite element unknowns: " << size << endl; + } + + // 8. Determine the list of true (i.e. parallel conforming) essential + // boundary dofs. In this example, the boundary conditions are defined + // by marking all the boundary attributes from the mesh as essential + // (Dirichlet) and converting them to a list of true dofs. + Array ess_tdof_list; + if (pmesh->bdr_attributes.Size()) + { + Array ess_bdr(pmesh->bdr_attributes.Max()); + ess_bdr = 1; + fespace->GetEssentialTrueDofs(ess_bdr, ess_tdof_list); + } + + // 9. Set up the parallel linear form b(.) which corresponds to the + // right-hand side of the FEM linear system, which in this case is + // (1,phi_i) where phi_i are the basis functions in fespace. + ParLinearForm *b = new ParLinearForm(fespace); + ConstantCoefficient one(1.0); + b->AddDomainIntegrator(new DomainLFIntegrator(one)); + b->Assemble(); + + // 10. Define the solution vector x as a parallel finite element grid function + // corresponding to fespace. Initialize x with initial guess of zero, + // which satisfies the boundary conditions. + ParGridFunction x(fespace); + x = 0.0; + + // 11. Set up the parallel bilinear form a(.,.) on the finite element space + // corresponding to the Laplacian operator -Delta, by adding the Diffusion + // domain integrator. + ParBilinearForm *a = new ParBilinearForm(fespace); + a->AddDomainIntegrator(new DiffusionIntegrator(one)); + + // 12. Assemble the parallel bilinear form and the corresponding linear + // system, applying any necessary transformations such as: parallel + // assembly, eliminating boundary conditions, applying conforming + // constraints for non-conforming AMR, static condensation, etc. 
+ a->Assemble(); + + OperatorPtr A; + Vector B, X; + a->FormLinearSystem(ess_tdof_list, x, *b, A, X, B); + + // 13. Solve the linear system A X = B. + // * With full assembly, use the preconditioner from AmgX. + + bool amgx_verbose = true; + + string amgx_config = "{ \n" + " \"config_version\": 2, \n" + " \"solver\": { \n" + " \"preconditioner\": { \n" + " \"algorithm\": \"AGGREGATION\", \n" + " \"solver\": \"AMG\", \n" + " \"smoother\": { \n" + " \"scope\": \"jacobi\", \n" + " \"solver\": \"BLOCK_JACOBI\", \n" + " \"relaxation_factor\": 0.7 \n" + " }, \n" + " \"presweeps\": 1, \n" + " \"max_iters\": 1, \n" + " \"interpolator\": \"D2\", \n" + " \"selector\": \"SIZE_2\", \n" + " \"scope\": \"amg\", \n" + " \"max_levels\": 100, \n" + " \"cycle\": \"V\", \n" + " \"postsweeps\": 1 \n" + " }, \n" + " \"solver\": \"PCG\", \n" + " \"max_iters\": 100, \n" + " \"convergence\": \"RELATIVE_INI_CORE\", \n" + " \"scope\": \"main\", \n" + " \"tolerance\": 1e-12, \n" + " \"norm\": \"L2\" "; + if (amgx_verbose) + { + amgx_config = amgx_config + ", \n" + " \"obtain_timings\": 1, \n" + " \"monitor_residual\": 1, \n" + " \"print_grid_stats\": 1, \n" + " \"print_solve_stats\": 1 \n"; + } + else + { + amgx_config = amgx_config + "\n"; + } + amgx_config = amgx_config + " } \n" + "} \n"; + + AmgXSolver *amgx = new AmgXSolver; + amgx->SetConvergenceCheck(true); + amgx->ReadParameters(amgx_config, AmgXSolver::INTERNAL); + amgx->InitExclusiveGPU(MPI_COMM_WORLD); + + amgx->SetOperator(*A.As()); + + // Warm-up CG solve (in case of JIT to avoid timing it) + { + Vector Xtmp(X); + amgx->Mult(B, Xtmp); + } + + // Sync all ranks + MPI_Barrier(pmesh->GetComm()); + + // Start CG timing. + tic_toc.Clear(); + + // Start & Stop CG timing. + tic_toc.Start(); + amgx->Mult(B, X); + tic_toc.Stop(); + double rt_min, rt_max, my_rt; + my_rt = tic_toc.RealTime(); + MPI_Reduce(&my_rt, &rt_min, 1, MPI_DOUBLE, MPI_MIN, 0, pmesh->GetComm()); + MPI_Reduce(&my_rt, &rt_max, 1, MPI_DOUBLE, MPI_MAX, 0, pmesh->GetComm()); + + // Print timing results. + if (myid == 0) + { + int cg_iter = amgx->GetNumIterations(); + // Note: In the pcg algorithm, the number of operator Mult() calls is + // N_iter and the number of preconditioner Mult() calls is N_iter+1. + cout << '\n' + << "Total iterations "<< cg_iter; + cout << '\n' + << "Total CG time: " << rt_max << " (" << rt_min << ") sec." + << endl; + cout << "Time per CG step: " + << rt_max / cg_iter << " (" + << rt_min / cg_iter << ") sec." << endl; + cout << "\n\"DOFs/sec\" in CG: " + << 1e-6*size*cg_iter/rt_max << " (" + << 1e-6*size*cg_iter/rt_min << ") million.\n" + << endl; + } + delete amgx; + + // 14. Recover the parallel grid function corresponding to X. This is the + // local finite element solution on each processor. + a->RecoverFEMSolution(X, *b, x); + + // 15. Save the refined mesh and the solution in parallel. This output can + // be viewed later using GLVis: "glvis -np -m mesh -g sol". + { + //disabled + } + + // 16. Send the solution by socket to a GLVis server. + //if (visualization) + { + //disabled + } + + // 17. Free the used memory. 
+ delete a; + delete b; + delete fespace; + if (order > 0) { delete fec; } + delete pmesh; + + MPI_Finalize(); + + return 0; +} + + +Mesh *make_mesh(int myid, int num_procs, int dim, int level, + int &par_ref_levels, Array &nxyz) +{ + int log_p = (int)floor(log((double)num_procs)/log(2.0) + 0.5); + MFEM_VERIFY((1 << log_p) == num_procs, + "number of processor is not a power of 2: " << num_procs); + MFEM_VERIFY(dim == 3, "dim = " << dim << " is NOT implemented!"); + + // Determine processor decomposition. + int s[3]; + s[0] = log_p/3 + (log_p%3 > 0 ? 1 : 0); + s[1] = log_p/3 + (log_p%3 > 1 ? 1 : 0); + s[2] = log_p/3; + nxyz.SetSize(dim); + nxyz[0] = 1 << s[0]; + nxyz[1] = 1 << s[1]; + nxyz[2] = 1 << s[2]; + + // Determine mesh size. + int ser_level = level%3; + par_ref_levels = level/3; + int log_n = log_p + ser_level; + int t[3]; + t[0] = log_n/3 + (log_n%3 > 0 ? 1 : 0); + t[1] = log_n/3 + (log_n%3 > 1 ? 1 : 0); + t[2] = log_n/3; + + // Create the Mesh. + const bool gen_edges = true; + const bool sfc_ordering = true; + Mesh *mesh = new Mesh(1 << t[0], 1 << t[1], 1 << t[2], + Element::HEXAHEDRON, gen_edges, + 1.0, 1.0, 1.0, sfc_ordering); + if (myid == 0) + { + cout << "Processor partitioning: "; + nxyz.Print(cout, dim); + + // Mesh dimensions AFTER parallel refinement: + cout << "Mesh dimensions: " + << (1 << (t[0]+par_ref_levels)) << ' ' + << (1 << (t[1]+par_ref_levels)) << ' ' + << (1 << (t[2]+par_ref_levels)) << endl; + } + + return mesh; +} diff --git a/tests/mfem_amgx/ex1p_amgx_agg_solver.sh b/tests/mfem_amgx/ex1p_amgx_agg_solver.sh new file mode 100644 index 0000000..8a8524c --- /dev/null +++ b/tests/mfem_amgx/ex1p_amgx_agg_solver.sh @@ -0,0 +1,85 @@ +# Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at +# the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights +# reserved. See files LICENSE and NOTICE for details. +# +# This file is part of CEED, a collection of benchmarks, miniapps, software +# libraries and APIs for efficient high-order finite element and spectral +# element discretizations for exascale applications. For more information and +# source code availability see http://github.com/ceed. +# +# The CEED research is supported by the Exascale Computing Project +# (17-SC-20-SC), a collaborative effort of two U.S. Department of Energy +# organizations (Office of Science and the National Nuclear Security +# Administration) responsible for the planning and preparation of a capable +# exascale ecosystem, including software, applications, hardware, advanced +# system engineering and early testbed platforms, in support of the nation's +# exascale computing imperative. 
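
For readers comparing the drivers: in the *_solver variants above, AmgX owns the entire PCG iteration (the outer "solver": "PCG" scope of the JSON config) and MFEM only hands it the assembled parallel operator. Below is a condensed sketch of that call sequence, assuming a parallel MFEM build with AmgX enabled; the helper name SolveWithAmgX is mine and the sketch is illustrative only, not part of the patch.

#include "mfem.hpp"
#include <string>

// 'config' is a JSON string such as the aggregation/PCG one assembled above.
void SolveWithAmgX(const std::string &config, mfem::HypreParMatrix &A,
                   const mfem::Vector &B, mfem::Vector &X)
{
   mfem::AmgXSolver amgx;
   amgx.SetConvergenceCheck(true);                          // AmgX monitors convergence itself
   amgx.ReadParameters(config, mfem::AmgXSolver::INTERNAL);
   amgx.InitExclusiveGPU(MPI_COMM_WORLD);                   // one GPU per MPI rank
   amgx.SetOperator(A);
   amgx.Mult(B, X);                                         // full PCG solve inside AmgX
   // amgx.GetNumIterations() then supplies the iteration count that the
   // drivers print for post-processing.
}
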
+ +problem=${problem:-0} + +function build_tests() +{ + local make_opts=( + "-j" "$num_proc_build" + "MFEM_DIR=$MFEM_DIR" + "BLD=$test_exe_dir/") + quoted_echo make "${make_opts[@]}" + [[ -n "$dry_run" ]] || make "${make_opts[@]}" +} + + +function run_tests() +{ + local test_name="ex1p_amgx_agg_solver" + set_mpi_options + # 'min_p' can be set on the command line + local l_min_p=${min_p:-1} + # 'max_p' can be set on the command line + local l_max_p=${max_p:-4} + # 'max_dofs_proc' can be set on the command line + local l_max_dofs_proc=${max_dofs_proc:-4200000} + # 'mfem_devs' can be set on the command line + local l_mfem_devs=(${mfem_devs:-cuda}) + local dim=3 + local args= + for dev in ${l_mfem_devs[@]}; do + for ((p = l_min_p; p <= l_max_p; p++)) do + for ((l = 0; (p**dim)*(2**l) <= l_max_dofs_proc; l++)) do + args=(-o $p -l $l -d $dev) + if [ -z "$dry_run" ]; then + echo "Running test:" + quoted_echo $mpi_run ./$test_name "${args[@]}" + $mpi_run ./$test_name "${args[@]}" || \ + printf "\nError in the test, error code: $?\n\n" + else + $dry_run $mpi_run ./$test_name "${args[@]}" + fi + done + done + done +} + + +function build_and_run_tests() +{ + $dry_run cd "$test_dir" + + build_tests || return 1 + echo + + [[ -n "$build_only" ]] && return + + $dry_run cd "$test_exe_dir" + run_tests +} + + +mfem_branch=${mfem_branch:-master} +libceed_branch=${libceed_branch:-master} + +# Uncomment the next line to enable 64-bit HYPRE_Int: +# hypre_big_int=1 + +packages=${packages:-cuda metis amgx hypre mfem} + +test_required_packages=${packages} diff --git a/tests/mfem_amgx/ex1p_amgx_agg_solver_l1_jacobi.cpp b/tests/mfem_amgx/ex1p_amgx_agg_solver_l1_jacobi.cpp new file mode 100644 index 0000000..38cd996 --- /dev/null +++ b/tests/mfem_amgx/ex1p_amgx_agg_solver_l1_jacobi.cpp @@ -0,0 +1,322 @@ +// MFEM Example 1 - Parallel Version +// +// Compile with: make ex1p +// +// Description: This example code demonstrates the use of MFEM to define a +// simple finite element discretization of the Laplace problem +// -Delta u = 1 with homogeneous Dirichlet boundary conditions. +// Specifically, we discretize using a FE space of the specified +// order, or if order < 1 using an isoparametric/isogeometric +// space (i.e. quadratic for quadratic curvilinear mesh, NURBS for +// NURBS mesh, etc.) + +#include "mfem.hpp" +#include +#include + +using namespace std; +using namespace mfem; + +Mesh *make_mesh(int myid, int num_procs, int dim, int level, + int &par_ref_levels, Array &nxyz); + +int main(int argc, char *argv[]) +{ + // 1. Initialize MPI. + int num_procs, myid; + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + + // 2. Parse command-line options. + int dim = 3; + int level = 0; + int order = 1; + const char *device_config = "cuda"; + + OptionsParser args(argc, argv); + args.AddOption(&dim, "-dim", "--mesh-dimension", + "Solve 2D or 3D problem."); + args.AddOption(&level, "-l", "--refinement-level", + "Set the problem size: 2^level mesh elements per processor."); + args.AddOption(&order, "-o", "--order", + "Finite element order (polynomial degree)."); + args.AddOption(&device_config, "-d", "--device", + "Device configuration string, see Device::Configure()."); + args.Parse(); + if (!args.Good()) + { + if (myid == 0) + { + args.PrintUsage(cout); + } + MPI_Finalize(); + return 1; + } + if (myid == 0) + { + args.PrintOptions(cout); + } + + // 3. 
Enable hardware devices such as GPUs, and programming models such as + // CUDA, OCCA, RAJA and OpenMP based on command line options. + Device device(device_config); + if (myid == 0) { device.Print(); } + + // 4. Read the (serial) mesh from the given mesh file on all processors. We + // can handle triangular, quadrilateral, tetrahedral, hexahedral, surface + // and volume meshes with the same code. + int par_ref_levels; + Array nxyz; + Mesh *mesh = make_mesh(myid, num_procs, dim, level, par_ref_levels, nxyz); + + // 5. Refine the serial mesh on all processors to increase the resolution. In + // this example we do 'ref_levels' of uniform refinement. We choose + // 'ref_levels' to be the largest number that gives a final mesh with no + // more than 10,000 elements. + { + // Disabled + } + + // 6. Define a parallel mesh by a partitioning of the serial mesh. Refine + // this mesh further in parallel to increase the resolution. Once the + // parallel mesh is defined, the serial mesh can be deleted. + int *partitioning = nxyz.Size() ? mesh->CartesianPartitioning(nxyz) : NULL; + ParMesh *pmesh = new ParMesh(MPI_COMM_WORLD, *mesh, partitioning); + delete [] partitioning; + delete mesh; + for (int l = 0; l < par_ref_levels; l++) + { + pmesh->UniformRefinement(); + } + // pmesh->PrintInfo(); + long global_ne = pmesh->ReduceInt(pmesh->GetNE()); + if (myid == 0) + { + cout << "Total number of elements: " << global_ne << endl; + } + + // 7. Define a parallel finite element space on the parallel mesh. Here we + // use continuous Lagrange finite elements of the specified order. If + // order < 1, we instead use an isoparametric/isogeometric space. + FiniteElementCollection *fec; + MFEM_VERIFY(order > 0, "invalid 'order': " << order); + fec = new H1_FECollection(order, dim); + ParFiniteElementSpace *fespace = new ParFiniteElementSpace(pmesh, fec); + HYPRE_Int size = fespace->GlobalTrueVSize(); + if (myid == 0) + { + cout << "Number of finite element unknowns: " << size << endl; + } + + // 8. Determine the list of true (i.e. parallel conforming) essential + // boundary dofs. In this example, the boundary conditions are defined + // by marking all the boundary attributes from the mesh as essential + // (Dirichlet) and converting them to a list of true dofs. + Array ess_tdof_list; + if (pmesh->bdr_attributes.Size()) + { + Array ess_bdr(pmesh->bdr_attributes.Max()); + ess_bdr = 1; + fespace->GetEssentialTrueDofs(ess_bdr, ess_tdof_list); + } + + // 9. Set up the parallel linear form b(.) which corresponds to the + // right-hand side of the FEM linear system, which in this case is + // (1,phi_i) where phi_i are the basis functions in fespace. + ParLinearForm *b = new ParLinearForm(fespace); + ConstantCoefficient one(1.0); + b->AddDomainIntegrator(new DomainLFIntegrator(one)); + b->Assemble(); + + // 10. Define the solution vector x as a parallel finite element grid function + // corresponding to fespace. Initialize x with initial guess of zero, + // which satisfies the boundary conditions. + ParGridFunction x(fespace); + x = 0.0; + + // 11. Set up the parallel bilinear form a(.,.) on the finite element space + // corresponding to the Laplacian operator -Delta, by adding the Diffusion + // domain integrator. + ParBilinearForm *a = new ParBilinearForm(fespace); + a->AddDomainIntegrator(new DiffusionIntegrator(one)); + + // 12. 
Assemble the parallel bilinear form and the corresponding linear + // system, applying any necessary transformations such as: parallel + // assembly, eliminating boundary conditions, applying conforming + // constraints for non-conforming AMR, static condensation, etc. + a->Assemble(); + + OperatorPtr A; + Vector B, X; + a->FormLinearSystem(ess_tdof_list, x, *b, A, X, B); + + // 13. Solve the linear system A X = B. + // * With full assembly, use the preconditioner from AmgX. + + bool amgx_verbose = true; + + string amgx_config = "{ \n" + " \"config_version\": 2, \n" + " \"solver\": { \n" + " \"preconditioner\": { \n" + " \"algorithm\": \"AGGREGATION\", \n" + " \"solver\": \"AMG\", \n" + " \"smoother\": { \n" + " \"scope\": \"jacobi\", \n" + " \"solver\": \"JACOBI_L1\" \n" + " }, \n" + " \"presweeps\": 1, \n" + " \"max_iters\": 1, \n" + " \"interpolator\": \"D2\", \n" + " \"selector\": \"SIZE_2\", \n" + " \"scope\": \"amg\", \n" + " \"max_levels\": 100, \n" + " \"cycle\": \"V\", \n" + " \"postsweeps\": 1 \n" + " }, \n" + " \"solver\": \"PCG\", \n" + " \"max_iters\": 100, \n" + " \"convergence\": \"RELATIVE_INI_CORE\", \n" + " \"scope\": \"main\", \n" + " \"tolerance\": 1e-12, \n" + " \"norm\": \"L2\" "; + if (amgx_verbose) + { + amgx_config = amgx_config + ", \n" + " \"obtain_timings\": 1, \n" + " \"monitor_residual\": 1, \n" + " \"print_grid_stats\": 1, \n" + " \"print_solve_stats\": 1 \n"; + } + else + { + amgx_config = amgx_config + "\n"; + } + amgx_config = amgx_config + " } \n" + "} \n"; + + AmgXSolver *amgx = new AmgXSolver; + amgx->SetConvergenceCheck(true); + amgx->ReadParameters(amgx_config, AmgXSolver::INTERNAL); + amgx->InitExclusiveGPU(MPI_COMM_WORLD); + + amgx->SetOperator(*A.As()); + + // Warm-up CG solve (in case of JIT to avoid timing it) + { + Vector Xtmp(X); + amgx->Mult(B, Xtmp); + } + + // Sync all ranks + MPI_Barrier(pmesh->GetComm()); + + // Start CG timing. + tic_toc.Clear(); + + // Start & Stop CG timing. + tic_toc.Start(); + amgx->Mult(B, X); + tic_toc.Stop(); + double rt_min, rt_max, my_rt; + my_rt = tic_toc.RealTime(); + MPI_Reduce(&my_rt, &rt_min, 1, MPI_DOUBLE, MPI_MIN, 0, pmesh->GetComm()); + MPI_Reduce(&my_rt, &rt_max, 1, MPI_DOUBLE, MPI_MAX, 0, pmesh->GetComm()); + + // Print timing results. + if (myid == 0) + { + int cg_iter = amgx->GetNumIterations(); + // Note: In the pcg algorithm, the number of operator Mult() calls is + // N_iter and the number of preconditioner Mult() calls is N_iter+1. + cout << '\n' + << "Total iterations "<< cg_iter; + cout << '\n' + << "Total CG time: " << rt_max << " (" << rt_min << ") sec." + << endl; + cout << "Time per CG step: " + << rt_max / cg_iter << " (" + << rt_min / cg_iter << ") sec." << endl; + cout << "\n\"DOFs/sec\" in CG: " + << 1e-6*size*cg_iter/rt_max << " (" + << 1e-6*size*cg_iter/rt_min << ") million.\n" + << endl; + } + delete amgx; + + // 14. Recover the parallel grid function corresponding to X. This is the + // local finite element solution on each processor. + a->RecoverFEMSolution(X, *b, x); + + // 15. Save the refined mesh and the solution in parallel. This output can + // be viewed later using GLVis: "glvis -np -m mesh -g sol". + { + //disabled + } + + // 16. Send the solution by socket to a GLVis server. + //if (visualization) + { + //disabled + } + + // 17. Free the used memory. 
+ delete a; + delete b; + delete fespace; + if (order > 0) { delete fec; } + delete pmesh; + + MPI_Finalize(); + + return 0; +} + + +Mesh *make_mesh(int myid, int num_procs, int dim, int level, + int &par_ref_levels, Array &nxyz) +{ + int log_p = (int)floor(log((double)num_procs)/log(2.0) + 0.5); + MFEM_VERIFY((1 << log_p) == num_procs, + "number of processor is not a power of 2: " << num_procs); + MFEM_VERIFY(dim == 3, "dim = " << dim << " is NOT implemented!"); + + // Determine processor decomposition. + int s[3]; + s[0] = log_p/3 + (log_p%3 > 0 ? 1 : 0); + s[1] = log_p/3 + (log_p%3 > 1 ? 1 : 0); + s[2] = log_p/3; + nxyz.SetSize(dim); + nxyz[0] = 1 << s[0]; + nxyz[1] = 1 << s[1]; + nxyz[2] = 1 << s[2]; + + // Determine mesh size. + int ser_level = level%3; + par_ref_levels = level/3; + int log_n = log_p + ser_level; + int t[3]; + t[0] = log_n/3 + (log_n%3 > 0 ? 1 : 0); + t[1] = log_n/3 + (log_n%3 > 1 ? 1 : 0); + t[2] = log_n/3; + + // Create the Mesh. + const bool gen_edges = true; + const bool sfc_ordering = true; + Mesh *mesh = new Mesh(1 << t[0], 1 << t[1], 1 << t[2], + Element::HEXAHEDRON, gen_edges, + 1.0, 1.0, 1.0, sfc_ordering); + if (myid == 0) + { + cout << "Processor partitioning: "; + nxyz.Print(cout, dim); + + // Mesh dimensions AFTER parallel refinement: + cout << "Mesh dimensions: " + << (1 << (t[0]+par_ref_levels)) << ' ' + << (1 << (t[1]+par_ref_levels)) << ' ' + << (1 << (t[2]+par_ref_levels)) << endl; + } + + return mesh; +} diff --git a/tests/mfem_amgx/ex1p_amgx_agg_solver_l1_jacobi.sh b/tests/mfem_amgx/ex1p_amgx_agg_solver_l1_jacobi.sh new file mode 100644 index 0000000..d41dea0 --- /dev/null +++ b/tests/mfem_amgx/ex1p_amgx_agg_solver_l1_jacobi.sh @@ -0,0 +1,85 @@ +# Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at +# the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights +# reserved. See files LICENSE and NOTICE for details. +# +# This file is part of CEED, a collection of benchmarks, miniapps, software +# libraries and APIs for efficient high-order finite element and spectral +# element discretizations for exascale applications. For more information and +# source code availability see http://github.com/ceed. +# +# The CEED research is supported by the Exascale Computing Project +# (17-SC-20-SC), a collaborative effort of two U.S. Department of Energy +# organizations (Office of Science and the National Nuclear Security +# Administration) responsible for the planning and preparation of a capable +# exascale ecosystem, including software, applications, hardware, advanced +# system engineering and early testbed platforms, in support of the nation's +# exascale computing imperative. 
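
The figure of merit printed by all of these drivers is derived the same way: each rank times its own solve, the root reduces the min/max across ranks, and "DOFs/sec" is unknowns times CG iterations divided by the slowest rank's time (the value in parentheses uses the fastest rank). The sketch below mirrors that reporting step with plain MPI and printf; the helper name ReportCG is mine and the code is illustrative only.

#include <mpi.h>
#include <cstdio>

void ReportCG(MPI_Comm comm, double my_rt, long long ndofs, int cg_iter)
{
   int myid;
   MPI_Comm_rank(comm, &myid);
   double rt_min, rt_max;
   MPI_Reduce(&my_rt, &rt_min, 1, MPI_DOUBLE, MPI_MIN, 0, comm);
   MPI_Reduce(&my_rt, &rt_max, 1, MPI_DOUBLE, MPI_MAX, 0, comm);
   if (myid == 0)
   {
      std::printf("Total iterations %d\n", cg_iter);
      std::printf("Total CG time: %g (%g) sec.\n", rt_max, rt_min);
      std::printf("Time per CG step: %g (%g) sec.\n", rt_max/cg_iter, rt_min/cg_iter);
      std::printf("\"DOFs/sec\" in CG: %g (%g) million.\n",
                  1e-6*ndofs*cg_iter/rt_max, 1e-6*ndofs*cg_iter/rt_min);
   }
}
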
+ +problem=${problem:-0} + +function build_tests() +{ + local make_opts=( + "-j" "$num_proc_build" + "MFEM_DIR=$MFEM_DIR" + "BLD=$test_exe_dir/") + quoted_echo make "${make_opts[@]}" + [[ -n "$dry_run" ]] || make "${make_opts[@]}" +} + + +function run_tests() +{ + local test_name="ex1p_amgx_agg_solver_l1_jacobi" + set_mpi_options + # 'min_p' can be set on the command line + local l_min_p=${min_p:-1} + # 'max_p' can be set on the command line + local l_max_p=${max_p:-4} + # 'max_dofs_proc' can be set on the command line + local l_max_dofs_proc=${max_dofs_proc:-4200000} + # 'mfem_devs' can be set on the command line + local l_mfem_devs=(${mfem_devs:-cuda}) + local dim=3 + local args= + for dev in ${l_mfem_devs[@]}; do + for ((p = l_min_p; p <= l_max_p; p++)) do + for ((l = 0; (p**dim)*(2**l) <= l_max_dofs_proc; l++)) do + args=(-o $p -l $l -d $dev) + if [ -z "$dry_run" ]; then + echo "Running test:" + quoted_echo $mpi_run ./$test_name "${args[@]}" + $mpi_run ./$test_name "${args[@]}" || \ + printf "\nError in the test, error code: $?\n\n" + else + $dry_run $mpi_run ./$test_name "${args[@]}" + fi + done + done + done +} + + +function build_and_run_tests() +{ + $dry_run cd "$test_dir" + + build_tests || return 1 + echo + + [[ -n "$build_only" ]] && return + + $dry_run cd "$test_exe_dir" + run_tests +} + + +mfem_branch=${mfem_branch:-master} +libceed_branch=${libceed_branch:-master} + +# Uncomment the next line to enable 64-bit HYPRE_Int: +# hypre_big_int=1 + +packages=${packages:-cuda metis amgx hypre mfem} + +test_required_packages=${packages} diff --git a/tests/mfem_amgx/ex1p_amgx_precon.cpp b/tests/mfem_amgx/ex1p_amgx_precon.cpp index 3c69958..29ef60f 100644 --- a/tests/mfem_amgx/ex1p_amgx_precon.cpp +++ b/tests/mfem_amgx/ex1p_amgx_precon.cpp @@ -154,7 +154,7 @@ int main(int argc, char *argv[]) // 13. Solve the linear system A X = B. // * With full assembly, use the preconditioner from AmgX. - bool amgx_verbose = false; + bool amgx_verbose = true; string amgx_config = "{\n" " \"config_version\": 2, \n" @@ -171,15 +171,11 @@ int main(int argc, char *argv[]) " \"strength_threshold\" : 0.25, \n" " \"postsweeps\": 1, \n" " \"max_iters\": 1, \n" - " \"convergence\": \"ABSOLUTE\", \n" " \"cycle\": \"V\""; if (amgx_verbose) { amgx_config = amgx_config + ",\n" - " \"obtain_timings\": 1, \n" - " \"monitor_residual\": 1, \n" - " \"print_grid_stats\": 1, \n" - " \"print_solve_stats\": 1 \n"; + " \"obtain_timings\": 1 \n"; } else { @@ -188,7 +184,7 @@ int main(int argc, char *argv[]) amgx_config = amgx_config + " }\n" + "}\n"; AmgXSolver *prec = new AmgXSolver; - prec->ConfigureAs(AmgXSolver::PRECONDITIONER); + prec->SetConvergenceCheck(false); prec->ReadParameters(amgx_config, AmgXSolver::INTERNAL); prec->InitExclusiveGPU(MPI_COMM_WORLD); @@ -232,6 +228,8 @@ int main(int argc, char *argv[]) int cg_iter = cg.GetNumIterations(); // Note: In the pcg algorithm, the number of operator Mult() calls is // N_iter and the number of preconditioner Mult() calls is N_iter+1. + cout << '\n' + << "Total iterations "<< cg_iter; cout << '\n' << "Total CG time: " << rt_max << " (" << rt_min << ") sec." 
<< endl; diff --git a/tests/mfem_amgx/ex1p_amgx_precon.sh b/tests/mfem_amgx/ex1p_amgx_precon.sh index a16279d..f2c6e02 100644 --- a/tests/mfem_amgx/ex1p_amgx_precon.sh +++ b/tests/mfem_amgx/ex1p_amgx_precon.sh @@ -30,7 +30,7 @@ function build_tests() function run_tests() { - local test_name="ex1p_amgx" + local test_name="ex1p_amgx_precon" set_mpi_options # 'min_p' can be set on the command line local l_min_p=${min_p:-1} @@ -74,7 +74,7 @@ function build_and_run_tests() } -mfem_branch=${mfem_branch:-amgx/artv3/configure} +mfem_branch=${mfem_branch:-master} libceed_branch=${libceed_branch:-master} # Uncomment the next line to enable 64-bit HYPRE_Int: diff --git a/tests/mfem_amgx/ex1p_amgx_precon_l1_jacobi.cpp b/tests/mfem_amgx/ex1p_amgx_precon_l1_jacobi.cpp new file mode 100644 index 0000000..174373d --- /dev/null +++ b/tests/mfem_amgx/ex1p_amgx_precon_l1_jacobi.cpp @@ -0,0 +1,320 @@ +// MFEM Example 1 - Parallel Version +// +// Compile with: make ex1p +// +// Description: This example code demonstrates the use of MFEM to define a +// simple finite element discretization of the Laplace problem +// -Delta u = 1 with homogeneous Dirichlet boundary conditions. +// Specifically, we discretize using a FE space of the specified +// order, or if order < 1 using an isoparametric/isogeometric +// space (i.e. quadratic for quadratic curvilinear mesh, NURBS for +// NURBS mesh, etc.) + +#include "mfem.hpp" +#include +#include + +using namespace std; +using namespace mfem; + +Mesh *make_mesh(int myid, int num_procs, int dim, int level, + int &par_ref_levels, Array &nxyz); + +int main(int argc, char *argv[]) +{ + // 1. Initialize MPI. + int num_procs, myid; + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + + // 2. Parse command-line options. + int dim = 3; + int level = 0; + int order = 1; + const char *device_config = "cuda"; + + OptionsParser args(argc, argv); + args.AddOption(&dim, "-dim", "--mesh-dimension", + "Solve 2D or 3D problem."); + args.AddOption(&level, "-l", "--refinement-level", + "Set the problem size: 2^level mesh elements per processor."); + args.AddOption(&order, "-o", "--order", + "Finite element order (polynomial degree)."); + args.AddOption(&device_config, "-d", "--device", + "Device configuration string, see Device::Configure()."); + args.Parse(); + if (!args.Good()) + { + if (myid == 0) + { + args.PrintUsage(cout); + } + MPI_Finalize(); + return 1; + } + if (myid == 0) + { + args.PrintOptions(cout); + } + + // 3. Enable hardware devices such as GPUs, and programming models such as + // CUDA, OCCA, RAJA and OpenMP based on command line options. + Device device(device_config); + if (myid == 0) { device.Print(); } + + // 4. Read the (serial) mesh from the given mesh file on all processors. We + // can handle triangular, quadrilateral, tetrahedral, hexahedral, surface + // and volume meshes with the same code. + int par_ref_levels; + Array nxyz; + Mesh *mesh = make_mesh(myid, num_procs, dim, level, par_ref_levels, nxyz); + + // 5. Refine the serial mesh on all processors to increase the resolution. In + // this example we do 'ref_levels' of uniform refinement. We choose + // 'ref_levels' to be the largest number that gives a final mesh with no + // more than 10,000 elements. + { + // Disabled + } + + // 6. Define a parallel mesh by a partitioning of the serial mesh. Refine + // this mesh further in parallel to increase the resolution. Once the + // parallel mesh is defined, the serial mesh can be deleted. 
+ int *partitioning = nxyz.Size() ? mesh->CartesianPartitioning(nxyz) : NULL; + ParMesh *pmesh = new ParMesh(MPI_COMM_WORLD, *mesh, partitioning); + delete [] partitioning; + delete mesh; + for (int l = 0; l < par_ref_levels; l++) + { + pmesh->UniformRefinement(); + } + // pmesh->PrintInfo(); + long global_ne = pmesh->ReduceInt(pmesh->GetNE()); + if (myid == 0) + { + cout << "Total number of elements: " << global_ne << endl; + } + + // 7. Define a parallel finite element space on the parallel mesh. Here we + // use continuous Lagrange finite elements of the specified order. If + // order < 1, we instead use an isoparametric/isogeometric space. + FiniteElementCollection *fec; + MFEM_VERIFY(order > 0, "invalid 'order': " << order); + fec = new H1_FECollection(order, dim); + ParFiniteElementSpace *fespace = new ParFiniteElementSpace(pmesh, fec); + HYPRE_Int size = fespace->GlobalTrueVSize(); + if (myid == 0) + { + cout << "Number of finite element unknowns: " << size << endl; + } + + // 8. Determine the list of true (i.e. parallel conforming) essential + // boundary dofs. In this example, the boundary conditions are defined + // by marking all the boundary attributes from the mesh as essential + // (Dirichlet) and converting them to a list of true dofs. + Array ess_tdof_list; + if (pmesh->bdr_attributes.Size()) + { + Array ess_bdr(pmesh->bdr_attributes.Max()); + ess_bdr = 1; + fespace->GetEssentialTrueDofs(ess_bdr, ess_tdof_list); + } + + // 9. Set up the parallel linear form b(.) which corresponds to the + // right-hand side of the FEM linear system, which in this case is + // (1,phi_i) where phi_i are the basis functions in fespace. + ParLinearForm *b = new ParLinearForm(fespace); + ConstantCoefficient one(1.0); + b->AddDomainIntegrator(new DomainLFIntegrator(one)); + b->Assemble(); + + // 10. Define the solution vector x as a parallel finite element grid function + // corresponding to fespace. Initialize x with initial guess of zero, + // which satisfies the boundary conditions. + ParGridFunction x(fespace); + x = 0.0; + + // 11. Set up the parallel bilinear form a(.,.) on the finite element space + // corresponding to the Laplacian operator -Delta, by adding the Diffusion + // domain integrator. + ParBilinearForm *a = new ParBilinearForm(fespace); + a->AddDomainIntegrator(new DiffusionIntegrator(one)); + + // 12. Assemble the parallel bilinear form and the corresponding linear + // system, applying any necessary transformations such as: parallel + // assembly, eliminating boundary conditions, applying conforming + // constraints for non-conforming AMR, static condensation, etc. + a->Assemble(); + + OperatorPtr A; + Vector B, X; + a->FormLinearSystem(ess_tdof_list, x, *b, A, X, B); + + // 13. Solve the linear system A X = B. + // * With full assembly, use the preconditioner from AmgX. 
+ + bool amgx_verbose = true; + + string amgx_config = "{\n" + " \"config_version\": 2, \n" + " \"solver\": { \n" + " \"solver\": \"AMG\", \n" + " \"smoother\": { \n" + " \"scope\": \"jacobi\", \n" + " \"solver\": \"JACOBI_L1\" \n" + " }, \n" + " \"presweeps\": 1, \n" + " \"interpolator\": \"D2\", \n" + " \"max_row_sum\" : 0.9, \n" + " \"strength_threshold\" : 0.25, \n" + " \"postsweeps\": 1, \n" + " \"max_iters\": 1, \n" + " \"cycle\": \"V\""; + if (amgx_verbose) + { + amgx_config = amgx_config + ",\n" + " \"obtain_timings\": 1 \n"; + } + else + { + amgx_config = amgx_config + "\n"; + } + amgx_config = amgx_config + " }\n" + "}\n"; + + AmgXSolver *prec = new AmgXSolver; + prec->SetConvergenceCheck(false); + prec->ReadParameters(amgx_config, AmgXSolver::INTERNAL); + prec->InitExclusiveGPU(MPI_COMM_WORLD); + + CGSolver cg(MPI_COMM_WORLD); + const int max_cg_iter = 200; + const int cg_print_level = 3; + cg.SetRelTol(1e-12); + cg.SetMaxIter(max_cg_iter); + cg.SetPrintLevel(cg_print_level); + if (prec) { cg.SetPreconditioner(*prec); } + cg.SetOperator(*A); + + // Warm-up CG solve (in case of JIT to avoid timing it) + { + Vector Xtmp(X); + cg.SetMaxIter(2); + cg.SetPrintLevel(-1); + cg.Mult(B, Xtmp); + cg.SetMaxIter(max_cg_iter); + cg.SetPrintLevel(cg_print_level); + } + + // Sync all ranks + MPI_Barrier(pmesh->GetComm()); + + // Start CG timing. + tic_toc.Clear(); + + // Start & Stop CG timing. + tic_toc.Start(); + cg.Mult(B, X); + tic_toc.Stop(); + double rt_min, rt_max, my_rt; + my_rt = tic_toc.RealTime(); + MPI_Reduce(&my_rt, &rt_min, 1, MPI_DOUBLE, MPI_MIN, 0, pmesh->GetComm()); + MPI_Reduce(&my_rt, &rt_max, 1, MPI_DOUBLE, MPI_MAX, 0, pmesh->GetComm()); + + // Print timing results. + if (myid == 0) + { + int cg_iter = cg.GetNumIterations(); + // Note: In the pcg algorithm, the number of operator Mult() calls is + // N_iter and the number of preconditioner Mult() calls is N_iter+1. + cout << '\n' + << "Total iterations "<< cg_iter; + cout << '\n' + << "Total CG time: " << rt_max << " (" << rt_min << ") sec." + << endl; + cout << "Time per CG step: " + << rt_max / cg_iter << " (" + << rt_min / cg_iter << ") sec." << endl; + cout << "\n\"DOFs/sec\" in CG: " + << 1e-6*size*cg_iter/rt_max << " (" + << 1e-6*size*cg_iter/rt_min << ") million.\n" + << endl; + } + delete prec; + + // 14. Recover the parallel grid function corresponding to X. This is the + // local finite element solution on each processor. + a->RecoverFEMSolution(X, *b, x); + + // 15. Save the refined mesh and the solution in parallel. This output can + // be viewed later using GLVis: "glvis -np -m mesh -g sol". + { + //disabled + } + + // 16. Send the solution by socket to a GLVis server. + //if (visualization) + { + //disabled + } + + // 17. Free the used memory. + delete a; + delete b; + delete fespace; + if (order > 0) { delete fec; } + delete pmesh; + + MPI_Finalize(); + + return 0; +} + + +Mesh *make_mesh(int myid, int num_procs, int dim, int level, + int &par_ref_levels, Array &nxyz) +{ + int log_p = (int)floor(log((double)num_procs)/log(2.0) + 0.5); + MFEM_VERIFY((1 << log_p) == num_procs, + "number of processor is not a power of 2: " << num_procs); + MFEM_VERIFY(dim == 3, "dim = " << dim << " is NOT implemented!"); + + // Determine processor decomposition. + int s[3]; + s[0] = log_p/3 + (log_p%3 > 0 ? 1 : 0); + s[1] = log_p/3 + (log_p%3 > 1 ? 1 : 0); + s[2] = log_p/3; + nxyz.SetSize(dim); + nxyz[0] = 1 << s[0]; + nxyz[1] = 1 << s[1]; + nxyz[2] = 1 << s[2]; + + // Determine mesh size. 
+ int ser_level = level%3; + par_ref_levels = level/3; + int log_n = log_p + ser_level; + int t[3]; + t[0] = log_n/3 + (log_n%3 > 0 ? 1 : 0); + t[1] = log_n/3 + (log_n%3 > 1 ? 1 : 0); + t[2] = log_n/3; + + // Create the Mesh. + const bool gen_edges = true; + const bool sfc_ordering = true; + Mesh *mesh = new Mesh(1 << t[0], 1 << t[1], 1 << t[2], + Element::HEXAHEDRON, gen_edges, + 1.0, 1.0, 1.0, sfc_ordering); + if (myid == 0) + { + cout << "Processor partitioning: "; + nxyz.Print(cout, dim); + + // Mesh dimensions AFTER parallel refinement: + cout << "Mesh dimensions: " + << (1 << (t[0]+par_ref_levels)) << ' ' + << (1 << (t[1]+par_ref_levels)) << ' ' + << (1 << (t[2]+par_ref_levels)) << endl; + } + + return mesh; +} diff --git a/tests/mfem_amgx/ex1p_amgx_precon_l1_jacobi.sh b/tests/mfem_amgx/ex1p_amgx_precon_l1_jacobi.sh new file mode 100644 index 0000000..ffdb5a1 --- /dev/null +++ b/tests/mfem_amgx/ex1p_amgx_precon_l1_jacobi.sh @@ -0,0 +1,85 @@ +# Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at +# the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights +# reserved. See files LICENSE and NOTICE for details. +# +# This file is part of CEED, a collection of benchmarks, miniapps, software +# libraries and APIs for efficient high-order finite element and spectral +# element discretizations for exascale applications. For more information and +# source code availability see http://github.com/ceed. +# +# The CEED research is supported by the Exascale Computing Project +# (17-SC-20-SC), a collaborative effort of two U.S. Department of Energy +# organizations (Office of Science and the National Nuclear Security +# Administration) responsible for the planning and preparation of a capable +# exascale ecosystem, including software, applications, hardware, advanced +# system engineering and early testbed platforms, in support of the nation's +# exascale computing imperative. 
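
In contrast, the *_precon variants above keep the Krylov iteration in MFEM: CGSolver drives PCG on the fully assembled operator and each preconditioner application is a single AmgX AMG V-cycle (max_iters: 1, convergence checking disabled). A condensed sketch of that wiring follows, assuming the same parallel MFEM+AmgX build; the helper name PCGWithAmgXPrecon is mine and the sketch is illustrative only, not part of the patch.

#include "mfem.hpp"
#include <string>

// 'config' is a JSON string such as the classical-AMG / JACOBI_L1 one above.
void PCGWithAmgXPrecon(const std::string &config, mfem::HypreParMatrix &A,
                       const mfem::Vector &B, mfem::Vector &X)
{
   mfem::AmgXSolver prec;
   prec.SetConvergenceCheck(false);   // a fixed one-cycle preconditioner
   prec.ReadParameters(config, mfem::AmgXSolver::INTERNAL);
   prec.InitExclusiveGPU(MPI_COMM_WORLD);

   mfem::CGSolver cg(MPI_COMM_WORLD);
   cg.SetRelTol(1e-12);
   cg.SetMaxIter(200);
   cg.SetPrintLevel(3);
   cg.SetPreconditioner(prec);        // set before SetOperator, matching the drivers above
   cg.SetOperator(A);
   cg.Mult(B, X);
}
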
+ +problem=${problem:-0} + +function build_tests() +{ + local make_opts=( + "-j" "$num_proc_build" + "MFEM_DIR=$MFEM_DIR" + "BLD=$test_exe_dir/") + quoted_echo make "${make_opts[@]}" + [[ -n "$dry_run" ]] || make "${make_opts[@]}" +} + + +function run_tests() +{ + local test_name="ex1p_amgx_precon_l1_jacobi" + set_mpi_options + # 'min_p' can be set on the command line + local l_min_p=${min_p:-1} + # 'max_p' can be set on the command line + local l_max_p=${max_p:-4} + # 'max_dofs_proc' can be set on the command line + local l_max_dofs_proc=${max_dofs_proc:-4200000} + # 'mfem_devs' can be set on the command line + local l_mfem_devs=(${mfem_devs:-cuda}) + local dim=3 + local args= + for dev in ${l_mfem_devs[@]}; do + for ((p = l_min_p; p <= l_max_p; p++)) do + for ((l = 0; (p**dim)*(2**l) <= l_max_dofs_proc; l++)) do + args=(-o $p -l $l -d $dev) + if [ -z "$dry_run" ]; then + echo "Running test:" + quoted_echo $mpi_run ./$test_name "${args[@]}" + $mpi_run ./$test_name "${args[@]}" || \ + printf "\nError in the test, error code: $?\n\n" + else + $dry_run $mpi_run ./$test_name "${args[@]}" + fi + done + done + done +} + + +function build_and_run_tests() +{ + $dry_run cd "$test_dir" + + build_tests || return 1 + echo + + [[ -n "$build_only" ]] && return + + $dry_run cd "$test_exe_dir" + run_tests +} + + +mfem_branch=${mfem_branch:-master} +libceed_branch=${libceed_branch:-master} + +# Uncomment the next line to enable 64-bit HYPRE_Int: +# hypre_big_int=1 + +packages=${packages:-cuda metis amgx hypre mfem} + +test_required_packages=${packages} diff --git a/tests/mfem_amgx/ex1p_amgx_solver.cpp b/tests/mfem_amgx/ex1p_amgx_solver.cpp index 37e25a9..3caddee 100644 --- a/tests/mfem_amgx/ex1p_amgx_solver.cpp +++ b/tests/mfem_amgx/ex1p_amgx_solver.cpp @@ -170,7 +170,7 @@ int main(int argc, char *argv[]) " \"interpolator\": \"D2\", \n" " \"max_row_sum\" : 0.9, \n" " \"strength_threshold\" : 0.25, \n" - " \"max_iters\": 2, \n" + " \"max_iters\": 1, \n" " \"scope\": \"amg\", \n" " \"max_levels\": 100, \n" " \"cycle\": \"V\", \n" @@ -178,7 +178,7 @@ int main(int argc, char *argv[]) " }, \n" " \"solver\": \"PCG\", \n" " \"max_iters\": 100, \n" - " \"convergence\": \"RELATIVE_MAX\", \n" + " \"convergence\": \"RELATIVE_INI_CORE\", \n" " \"scope\": \"main\", \n" " \"tolerance\": 1e-12, \n" " \"norm\": \"L2\" "; @@ -197,7 +197,7 @@ int main(int argc, char *argv[]) amgx_config = amgx_config + " } \n" + "} \n"; AmgXSolver *amgx = new AmgXSolver; - amgx->ConfigureAs(AmgXSolver::SOLVER); + amgx->SetConvergenceCheck(true); amgx->ReadParameters(amgx_config, AmgXSolver::INTERNAL); amgx->InitExclusiveGPU(MPI_COMM_WORLD); diff --git a/tests/mfem_amgx/ex1p_amgx_solver.sh b/tests/mfem_amgx/ex1p_amgx_solver.sh index 6644dba..67ef6e0 100644 --- a/tests/mfem_amgx/ex1p_amgx_solver.sh +++ b/tests/mfem_amgx/ex1p_amgx_solver.sh @@ -74,7 +74,7 @@ function build_and_run_tests() } -mfem_branch=${mfem_branch:-amgx/artv3/configure} +mfem_branch=${mfem_branch:-master} libceed_branch=${libceed_branch:-master} # Uncomment the next line to enable 64-bit HYPRE_Int: diff --git a/tests/mfem_amgx/ex1p_amgx_solver_l1_jacobi.cpp b/tests/mfem_amgx/ex1p_amgx_solver_l1_jacobi.cpp new file mode 100644 index 0000000..d28aced --- /dev/null +++ b/tests/mfem_amgx/ex1p_amgx_solver_l1_jacobi.cpp @@ -0,0 +1,322 @@ +// MFEM Example 1 - Parallel Version +// +// Compile with: make ex1p +// +// Description: This example code demonstrates the use of MFEM to define a +// simple finite element discretization of the Laplace problem +// -Delta u = 1 with 
homogeneous Dirichlet boundary conditions. +// Specifically, we discretize using a FE space of the specified +// order, or if order < 1 using an isoparametric/isogeometric +// space (i.e. quadratic for quadratic curvilinear mesh, NURBS for +// NURBS mesh, etc.) + +#include "mfem.hpp" +#include +#include + +using namespace std; +using namespace mfem; + +Mesh *make_mesh(int myid, int num_procs, int dim, int level, + int &par_ref_levels, Array &nxyz); + +int main(int argc, char *argv[]) +{ + // 1. Initialize MPI. + int num_procs, myid; + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + + // 2. Parse command-line options. + int dim = 3; + int level = 0; + int order = 1; + const char *device_config = "cuda"; + + OptionsParser args(argc, argv); + args.AddOption(&dim, "-dim", "--mesh-dimension", + "Solve 2D or 3D problem."); + args.AddOption(&level, "-l", "--refinement-level", + "Set the problem size: 2^level mesh elements per processor."); + args.AddOption(&order, "-o", "--order", + "Finite element order (polynomial degree)."); + args.AddOption(&device_config, "-d", "--device", + "Device configuration string, see Device::Configure()."); + args.Parse(); + if (!args.Good()) + { + if (myid == 0) + { + args.PrintUsage(cout); + } + MPI_Finalize(); + return 1; + } + if (myid == 0) + { + args.PrintOptions(cout); + } + + // 3. Enable hardware devices such as GPUs, and programming models such as + // CUDA, OCCA, RAJA and OpenMP based on command line options. + Device device(device_config); + if (myid == 0) { device.Print(); } + + // 4. Read the (serial) mesh from the given mesh file on all processors. We + // can handle triangular, quadrilateral, tetrahedral, hexahedral, surface + // and volume meshes with the same code. + int par_ref_levels; + Array nxyz; + Mesh *mesh = make_mesh(myid, num_procs, dim, level, par_ref_levels, nxyz); + + // 5. Refine the serial mesh on all processors to increase the resolution. In + // this example we do 'ref_levels' of uniform refinement. We choose + // 'ref_levels' to be the largest number that gives a final mesh with no + // more than 10,000 elements. + { + // Disabled + } + + // 6. Define a parallel mesh by a partitioning of the serial mesh. Refine + // this mesh further in parallel to increase the resolution. Once the + // parallel mesh is defined, the serial mesh can be deleted. + int *partitioning = nxyz.Size() ? mesh->CartesianPartitioning(nxyz) : NULL; + ParMesh *pmesh = new ParMesh(MPI_COMM_WORLD, *mesh, partitioning); + delete [] partitioning; + delete mesh; + for (int l = 0; l < par_ref_levels; l++) + { + pmesh->UniformRefinement(); + } + // pmesh->PrintInfo(); + long global_ne = pmesh->ReduceInt(pmesh->GetNE()); + if (myid == 0) + { + cout << "Total number of elements: " << global_ne << endl; + } + + // 7. Define a parallel finite element space on the parallel mesh. Here we + // use continuous Lagrange finite elements of the specified order. If + // order < 1, we instead use an isoparametric/isogeometric space. + FiniteElementCollection *fec; + MFEM_VERIFY(order > 0, "invalid 'order': " << order); + fec = new H1_FECollection(order, dim); + ParFiniteElementSpace *fespace = new ParFiniteElementSpace(pmesh, fec); + HYPRE_Int size = fespace->GlobalTrueVSize(); + if (myid == 0) + { + cout << "Number of finite element unknowns: " << size << endl; + } + + // 8. Determine the list of true (i.e. parallel conforming) essential + // boundary dofs. 
In this example, the boundary conditions are defined + // by marking all the boundary attributes from the mesh as essential + // (Dirichlet) and converting them to a list of true dofs. + Array ess_tdof_list; + if (pmesh->bdr_attributes.Size()) + { + Array ess_bdr(pmesh->bdr_attributes.Max()); + ess_bdr = 1; + fespace->GetEssentialTrueDofs(ess_bdr, ess_tdof_list); + } + + // 9. Set up the parallel linear form b(.) which corresponds to the + // right-hand side of the FEM linear system, which in this case is + // (1,phi_i) where phi_i are the basis functions in fespace. + ParLinearForm *b = new ParLinearForm(fespace); + ConstantCoefficient one(1.0); + b->AddDomainIntegrator(new DomainLFIntegrator(one)); + b->Assemble(); + + // 10. Define the solution vector x as a parallel finite element grid function + // corresponding to fespace. Initialize x with initial guess of zero, + // which satisfies the boundary conditions. + ParGridFunction x(fespace); + x = 0.0; + + // 11. Set up the parallel bilinear form a(.,.) on the finite element space + // corresponding to the Laplacian operator -Delta, by adding the Diffusion + // domain integrator. + ParBilinearForm *a = new ParBilinearForm(fespace); + a->AddDomainIntegrator(new DiffusionIntegrator(one)); + + // 12. Assemble the parallel bilinear form and the corresponding linear + // system, applying any necessary transformations such as: parallel + // assembly, eliminating boundary conditions, applying conforming + // constraints for non-conforming AMR, static condensation, etc. + a->Assemble(); + + OperatorPtr A; + Vector B, X; + a->FormLinearSystem(ess_tdof_list, x, *b, A, X, B); + + // 13. Solve the linear system A X = B. + // * With full assembly, use the preconditioner from AmgX. + + bool amgx_verbose = true; + + string amgx_config = "{ \n" + " \"config_version\": 2, \n" + " \"solver\": { \n" + " \"preconditioner\": { \n" + " \"solver\": \"AMG\", \n" + " \"smoother\": { \n" + " \"scope\": \"jacobi\", \n" + " \"solver\": \"JACOBI_L1\" \n" + " }, \n" + " \"presweeps\": 1, \n" + " \"interpolator\": \"D2\", \n" + " \"max_row_sum\" : 0.9, \n" + " \"strength_threshold\" : 0.25, \n" + " \"max_iters\": 1, \n" + " \"scope\": \"amg\", \n" + " \"max_levels\": 100, \n" + " \"cycle\": \"V\", \n" + " \"postsweeps\": 1 \n" + " }, \n" + " \"solver\": \"PCG\", \n" + " \"max_iters\": 100, \n" + " \"convergence\": \"RELATIVE_INI_CORE\", \n" + " \"scope\": \"main\", \n" + " \"tolerance\": 1e-12, \n" + " \"norm\": \"L2\" "; + if (amgx_verbose) + { + amgx_config = amgx_config + ", \n" + " \"obtain_timings\": 1, \n" + " \"monitor_residual\": 1, \n" + " \"print_grid_stats\": 1, \n" + " \"print_solve_stats\": 1 \n"; + } + else + { + amgx_config = amgx_config + "\n"; + } + amgx_config = amgx_config + " } \n" + "} \n"; + + AmgXSolver *amgx = new AmgXSolver; + amgx->SetConvergenceCheck(true); + amgx->ReadParameters(amgx_config, AmgXSolver::INTERNAL); + amgx->InitExclusiveGPU(MPI_COMM_WORLD); + + amgx->SetOperator(*A.As()); + + // Warm-up CG solve (in case of JIT to avoid timing it) + { + Vector Xtmp(X); + amgx->Mult(B, Xtmp); + } + + // Sync all ranks + MPI_Barrier(pmesh->GetComm()); + + // Start CG timing. + tic_toc.Clear(); + + // Start & Stop CG timing. + tic_toc.Start(); + amgx->Mult(B, X); + tic_toc.Stop(); + double rt_min, rt_max, my_rt; + my_rt = tic_toc.RealTime(); + MPI_Reduce(&my_rt, &rt_min, 1, MPI_DOUBLE, MPI_MIN, 0, pmesh->GetComm()); + MPI_Reduce(&my_rt, &rt_max, 1, MPI_DOUBLE, MPI_MAX, 0, pmesh->GetComm()); + + // Print timing results. 
+ if (myid == 0) + { + int cg_iter = amgx->GetNumIterations(); + // Note: In the pcg algorithm, the number of operator Mult() calls is + // N_iter and the number of preconditioner Mult() calls is N_iter+1. + cout << '\n' + << "Total iterations "<< cg_iter; + cout << '\n' + << "Total CG time: " << rt_max << " (" << rt_min << ") sec." + << endl; + cout << "Time per CG step: " + << rt_max / cg_iter << " (" + << rt_min / cg_iter << ") sec." << endl; + cout << "\n\"DOFs/sec\" in CG: " + << 1e-6*size*cg_iter/rt_max << " (" + << 1e-6*size*cg_iter/rt_min << ") million.\n" + << endl; + } + delete amgx; + + // 14. Recover the parallel grid function corresponding to X. This is the + // local finite element solution on each processor. + a->RecoverFEMSolution(X, *b, x); + + // 15. Save the refined mesh and the solution in parallel. This output can + // be viewed later using GLVis: "glvis -np -m mesh -g sol". + { + //disabled + } + + // 16. Send the solution by socket to a GLVis server. + //if (visualization) + { + //disabled + } + + // 17. Free the used memory. + delete a; + delete b; + delete fespace; + if (order > 0) { delete fec; } + delete pmesh; + + MPI_Finalize(); + + return 0; +} + + +Mesh *make_mesh(int myid, int num_procs, int dim, int level, + int &par_ref_levels, Array &nxyz) +{ + int log_p = (int)floor(log((double)num_procs)/log(2.0) + 0.5); + MFEM_VERIFY((1 << log_p) == num_procs, + "number of processor is not a power of 2: " << num_procs); + MFEM_VERIFY(dim == 3, "dim = " << dim << " is NOT implemented!"); + + // Determine processor decomposition. + int s[3]; + s[0] = log_p/3 + (log_p%3 > 0 ? 1 : 0); + s[1] = log_p/3 + (log_p%3 > 1 ? 1 : 0); + s[2] = log_p/3; + nxyz.SetSize(dim); + nxyz[0] = 1 << s[0]; + nxyz[1] = 1 << s[1]; + nxyz[2] = 1 << s[2]; + + // Determine mesh size. + int ser_level = level%3; + par_ref_levels = level/3; + int log_n = log_p + ser_level; + int t[3]; + t[0] = log_n/3 + (log_n%3 > 0 ? 1 : 0); + t[1] = log_n/3 + (log_n%3 > 1 ? 1 : 0); + t[2] = log_n/3; + + // Create the Mesh. + const bool gen_edges = true; + const bool sfc_ordering = true; + Mesh *mesh = new Mesh(1 << t[0], 1 << t[1], 1 << t[2], + Element::HEXAHEDRON, gen_edges, + 1.0, 1.0, 1.0, sfc_ordering); + if (myid == 0) + { + cout << "Processor partitioning: "; + nxyz.Print(cout, dim); + + // Mesh dimensions AFTER parallel refinement: + cout << "Mesh dimensions: " + << (1 << (t[0]+par_ref_levels)) << ' ' + << (1 << (t[1]+par_ref_levels)) << ' ' + << (1 << (t[2]+par_ref_levels)) << endl; + } + + return mesh; +} diff --git a/tests/mfem_amgx/ex1p_amgx_solver_l1_jacobi.sh b/tests/mfem_amgx/ex1p_amgx_solver_l1_jacobi.sh new file mode 100644 index 0000000..43d31b3 --- /dev/null +++ b/tests/mfem_amgx/ex1p_amgx_solver_l1_jacobi.sh @@ -0,0 +1,85 @@ +# Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at +# the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights +# reserved. See files LICENSE and NOTICE for details. +# +# This file is part of CEED, a collection of benchmarks, miniapps, software +# libraries and APIs for efficient high-order finite element and spectral +# element discretizations for exascale applications. For more information and +# source code availability see http://github.com/ceed. +# +# The CEED research is supported by the Exascale Computing Project +# (17-SC-20-SC), a collaborative effort of two U.S. 
Department of Energy +# organizations (Office of Science and the National Nuclear Security +# Administration) responsible for the planning and preparation of a capable +# exascale ecosystem, including software, applications, hardware, advanced +# system engineering and early testbed platforms, in support of the nation's +# exascale computing imperative. + +problem=${problem:-0} + +function build_tests() +{ + local make_opts=( + "-j" "$num_proc_build" + "MFEM_DIR=$MFEM_DIR" + "BLD=$test_exe_dir/") + quoted_echo make "${make_opts[@]}" + [[ -n "$dry_run" ]] || make "${make_opts[@]}" +} + + +function run_tests() +{ + local test_name="ex1p_amgx_solver_l1_jacobi" + set_mpi_options + # 'min_p' can be set on the command line + local l_min_p=${min_p:-1} + # 'max_p' can be set on the command line + local l_max_p=${max_p:-4} + # 'max_dofs_proc' can be set on the command line + local l_max_dofs_proc=${max_dofs_proc:-4200000} + # 'mfem_devs' can be set on the command line + local l_mfem_devs=(${mfem_devs:-cuda}) + local dim=3 + local args= + for dev in ${l_mfem_devs[@]}; do + for ((p = l_min_p; p <= l_max_p; p++)) do + for ((l = 0; (p**dim)*(2**l) <= l_max_dofs_proc; l++)) do + args=(-o $p -l $l -d $dev) + if [ -z "$dry_run" ]; then + echo "Running test:" + quoted_echo $mpi_run ./$test_name "${args[@]}" + $mpi_run ./$test_name "${args[@]}" || \ + printf "\nError in the test, error code: $?\n\n" + else + $dry_run $mpi_run ./$test_name "${args[@]}" + fi + done + done + done +} + + +function build_and_run_tests() +{ + $dry_run cd "$test_dir" + + build_tests || return 1 + echo + + [[ -n "$build_only" ]] && return + + $dry_run cd "$test_exe_dir" + run_tests +} + + +mfem_branch=${mfem_branch:-master} +libceed_branch=${libceed_branch:-master} + +# Uncomment the next line to enable 64-bit HYPRE_Int: +# hypre_big_int=1 + +packages=${packages:-cuda metis amgx hypre mfem} + +test_required_packages=${packages} diff --git a/tests/mfem_amgx/ex1p_amgx_solver_l1_jacobi.sh~ b/tests/mfem_amgx/ex1p_amgx_solver_l1_jacobi.sh~ new file mode 100644 index 0000000..9af721f --- /dev/null +++ b/tests/mfem_amgx/ex1p_amgx_solver_l1_jacobi.sh~ @@ -0,0 +1,85 @@ +# Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at +# the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights +# reserved. See files LICENSE and NOTICE for details. +# +# This file is part of CEED, a collection of benchmarks, miniapps, software +# libraries and APIs for efficient high-order finite element and spectral +# element discretizations for exascale applications. For more information and +# source code availability see http://github.com/ceed. +# +# The CEED research is supported by the Exascale Computing Project +# (17-SC-20-SC), a collaborative effort of two U.S. Department of Energy +# organizations (Office of Science and the National Nuclear Security +# Administration) responsible for the planning and preparation of a capable +# exascale ecosystem, including software, applications, hardware, advanced +# system engineering and early testbed platforms, in support of the nation's +# exascale computing imperative. 
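
The run_tests() loops in the launch scripts above sweep the polynomial order p and refinement level l, keeping a case only while p^dim * 2^l stays below max_dofs_proc (4,200,000 by default), which reads as a rough per-rank problem-size cap. The C++ sketch below simply enumerates the (p, l) pairs those defaults would launch; the "rough per-rank estimate" interpretation and the translation from the shell arithmetic are mine, and the sketch is illustrative only.

#include <cstdio>

int main()
{
   const int  dim = 3;
   const int  min_p = 1, max_p = 4;        // script defaults
   const long max_dofs_proc = 4200000;     // script default

   for (int p = min_p; p <= max_p; p++)
   {
      long p_dim = 1;
      for (int d = 0; d < dim; d++) { p_dim *= p; }        // p**dim
      for (long l = 0; p_dim*(1L << l) <= max_dofs_proc; l++)
      {
         std::printf("would run: -o %d -l %ld -d cuda   (p^dim * 2^l = %ld)\n",
                     p, l, p_dim*(1L << l));
      }
   }
   return 0;
}
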
+ +problem=${problem:-0} + +function build_tests() +{ + local make_opts=( + "-j" "$num_proc_build" + "MFEM_DIR=$MFEM_DIR" + "BLD=$test_exe_dir/") + quoted_echo make "${make_opts[@]}" + [[ -n "$dry_run" ]] || make "${make_opts[@]}" +} + + +function run_tests() +{ + local test_name="ex1p_amgx_precon_l1_jacobi" + set_mpi_options + # 'min_p' can be set on the command line + local l_min_p=${min_p:-1} + # 'max_p' can be set on the command line + local l_max_p=${max_p:-4} + # 'max_dofs_proc' can be set on the command line + local l_max_dofs_proc=${max_dofs_proc:-4200000} + # 'mfem_devs' can be set on the command line + local l_mfem_devs=(${mfem_devs:-cuda}) + local dim=3 + local args= + for dev in ${l_mfem_devs[@]}; do + for ((p = l_min_p; p <= l_max_p; p++)) do + for ((l = 0; (p**dim)*(2**l) <= l_max_dofs_proc; l++)) do + args=(-o $p -l $l -d $dev) + if [ -z "$dry_run" ]; then + echo "Running test:" + quoted_echo $mpi_run ./$test_name "${args[@]}" + $mpi_run ./$test_name "${args[@]}" || \ + printf "\nError in the test, error code: $?\n\n" + else + $dry_run $mpi_run ./$test_name "${args[@]}" + fi + done + done + done +} + + +function build_and_run_tests() +{ + $dry_run cd "$test_dir" + + build_tests || return 1 + echo + + [[ -n "$build_only" ]] && return + + $dry_run cd "$test_exe_dir" + run_tests +} + + +mfem_branch=${mfem_branch:-amgx/artv3/configure} +libceed_branch=${libceed_branch:-master} + +# Uncomment the next line to enable 64-bit HYPRE_Int: +# hypre_big_int=1 + +packages=${packages:-cuda metis amgx hypre mfem} + +test_required_packages=${packages} diff --git a/tests/mfem_amgx/makefile b/tests/mfem_amgx/makefile index 59d7e82..3ba3262 100644 --- a/tests/mfem_amgx/makefile +++ b/tests/mfem_amgx/makefile @@ -27,6 +27,12 @@ MFEM_LIB_FILE = mfem_is_not_built EX1 = $(BLD)ex1p_amgx_precon EX2 = $(BLD)ex1p_amgx_solver +EX3 = $(BLD)ex1p_amgx_agg_solver +EX4 = $(BLD)ex1p_amgx_agg_precon +EX5 = $(BLD)ex1p_amgx_precon_l1_jacobi +EX6 = $(BLD)ex1p_amgx_solver_l1_jacobi +EX7 = $(BLD)ex1p_amgx_agg_solver_l1_jacobi +EX8 = $(BLD)ex1p_amgx_agg_precon_l1_jacobi ifeq ($(MFEM_USE_MPI),NO) $(error A parallel MFEM build is required.) endif @@ -42,7 +48,7 @@ endif $(BLD)%: $(SRC)%.cpp $(MFEM_LIB_FILE) $(CONFIG_MK) $(MFEM_CXX) $(MFEM_FLAGS) $< -o $@ $(MFEM_LIBS) -all: $(EX1) $(EX2) +all: $(EX1) $(EX2) $(EX3) $(EX4) $(EX5) $(EX6) $(EX7) $(EX8) # Generate an error message if the MFEM library is not built and exit $(MFEM_LIB_FILE):