cuda - remove duplicate mats in gen #1739

jeremylt · 2025-02-06T22:16:11Z

This should let us run bigger elements for gen, and its just silly to do work we don't need to do.

Sample output:

// -----------------------------------------------------------------------------
// Operator Kernel
// 
// d_[in,out]_i:   CeedVector device array
// r_[in,out]_e_i: Element vector register
// r_[in,out]_q_i: Quadrature space vector register
// r_[in,out]_c_i: AtPoints Chebyshev coefficients register
// r_[in,out]_s_i: Quadrature space slice vector register
// 
// s_B_[in,out]_i: Interpolation matrix, shared memory
// s_G_[in,out]_i: Gradient matrix, shared memory
// -----------------------------------------------------------------------------
extern "C" __global__ void CeedKernelCudaGenOperator_Poisson3DApply(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda points) {
  const CeedScalar *d_in_0 = fields.inputs[0];
  const CeedScalar *d_in_1 = fields.inputs[1];
  CeedScalar *d_out_0 = fields.outputs[0];
  const CeedInt dim = 3;
  const CeedInt Q_1d = 6;
  extern __shared__ CeedScalar slice[];
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;
  data.slice = slice + data.t_id_z*T_1D*T_1D;

  // Input field constants and basis data
  // -- Input field 0
  const CeedInt P_1d_in_0 = 5;
  const CeedInt num_comp_in_0 = 1;
  // EvalMode: gradient
  __shared__ CeedScalar s_B_in_0[P_1d_in_0*Q_1d];
  LoadMatrix<P_1d_in_0, Q_1d>(data, B.inputs[0], s_B_in_0);
  __shared__ CeedScalar s_G_in_0[Q_1d*Q_1d];
  LoadMatrix<Q_1d, Q_1d>(data, G.inputs[0], s_G_in_0);
  // -- Input field 1
  const CeedInt P_1d_in_1 = 6;
  const CeedInt num_comp_in_1 = 6;
  // EvalMode: none

  // Output field constants and basis data
  // -- Output field 0
  const CeedInt P_1d_out_0 = 5;
  const CeedInt num_comp_out_0 = 1;
  // EvalMode: gradient
  CeedScalar *s_B_out_0 = s_B_in_0;
  CeedScalar *s_G_out_0 = s_G_in_0;

  // Element loop
  __syncthreads();
  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {
    // Scratch restriction buffer space
    CeedScalar r_e_scratch[1296];

    // -- Input field restrictions and basis actions
    // ---- Input field 0
    CeedScalar *r_e_in_0 = r_e_scratch;
    const CeedInt l_size_in_0 = 274625;
    // CompStride: 274625
    ReadLVecStandard3d<num_comp_in_0, 274625, P_1d_in_0>(data, l_size_in_0, elem, indices.inputs[0], d_in_0, r_e_in_0);
    // EvalMode: gradient
    CeedScalar r_q_in_0[num_comp_in_0*Q_1d];
    InterpTensor3d<num_comp_in_0, P_1d_in_0, Q_1d>(data, r_e_in_0, s_B_in_0, r_q_in_0);
    // ---- Input field 1
    CeedScalar r_e_in_1[num_comp_in_1*P_1d_in_1];
    // Strides: {1, 884736, 216}
    ReadLVecStrided3d<num_comp_in_1, P_1d_in_1, 1, 884736, 216>(data, elem, d_in_1, r_e_in_1);
    // EvalMode: none

    // -- Output field setup
    // ---- Output field 0
    CeedScalar r_q_out_0[num_comp_out_0*Q_1d];
    for (CeedInt i = 0; i < num_comp_out_0*Q_1d; i++) {
      r_q_out_0[i] = 0.0;
    }

    // Note: Using planes of 3D elements
    #pragma unroll
    for (CeedInt q = 0; q < Q_1d; q++) {
      // -- Input fields
      // ---- Input field 0
      // EvalMode: gradient
      CeedScalar r_s_in_0[num_comp_in_0*dim];
      GradColloSlice3d<num_comp_in_0, Q_1d>(data, q, r_q_in_0, s_G_in_0, r_s_in_0);
      // ---- Input field 1
      // EvalMode: none
      CeedScalar r_s_in_1[num_comp_in_1];
      // Strides: {1, 884736, 216}
      ReadEVecSliceStrided3d<num_comp_in_1, Q_1d, 1, 884736, 216>(data, elem, q, d_in_1, r_s_in_1);

      // -- Output fields
      // ---- Output field 0
      CeedScalar r_s_out_0[num_comp_out_0*dim];

      // -- QFunction inputs and outputs
      // ---- Inputs
      CeedScalar *inputs[2];
      // ------ Input field 0
      inputs[0] = r_s_in_0;
      // ------ Input field 1
      inputs[1] = r_s_in_1;
      // ---- Outputs
      CeedScalar *outputs[1];
      // ------ Output field 0
      outputs[0] = r_s_out_0;

      // -- Apply QFunction
      Poisson3DApply(ctx, 1, inputs, outputs);

      // -- Output fields
      // ---- Output field 0
      // EvalMode: gradient
      GradColloSliceTranspose3d<num_comp_out_0, Q_1d>(data, q, r_s_out_0, s_G_out_0, r_q_out_0);
    }

    // -- Output field basis action and restrictions
    // ---- Output field 0
    // EvalMode: gradient
    CeedScalar *r_e_out_0 = r_e_scratch;
    InterpTransposeTensor3d<num_comp_out_0, P_1d_out_0, Q_1d>(data, r_q_out_0, s_B_out_0, r_e_out_0);
    const CeedInt l_size_out_0 = 274625;
    // CompStride: 274625
    WriteLVecStandard3d<num_comp_out_0, 274625, P_1d_out_0>(data, l_size_out_0, elem, indices.outputs[0], r_e_out_0, d_out_0);
  }
}
// -----------------------------------------------------------------------------

jeremylt · 2025-02-07T19:41:53Z

backends/cuda-gen/ceed-cuda-gen-operator-build.cpp

+  bool         use_previous_field = field_reuse[0] != -1;
+  bool         reuse_input        = field_reuse[1];
+  CeedInt      reuse_field        = field_reuse[0];
+  CeedEvalMode reuse_mode         = (CeedEvalMode)field_reuse[2];


It would probably be clearer to make a small struct to do this. In progress

jeremylt added GPU CUDA 0-WIP HIP labels Feb 6, 2025

jeremylt self-assigned this Feb 6, 2025

jeremylt force-pushed the jeremy/gen-matload-no-dupe branch from 553c07e to a4c0ed8 Compare February 7, 2025 17:46

jeremylt added 1-In Review and removed 0-WIP labels Feb 7, 2025

jeremylt commented Feb 7, 2025

View reviewed changes

jeremylt added 2 commits February 7, 2025 16:11

cuda - remove duplicate mats in gen

0a2a649

hip - remove duplicate mats in gen

9ee499e

jeremylt force-pushed the jeremy/gen-matload-no-dupe branch 2 times, most recently from b6fbfc6 to 9f620e6 Compare February 7, 2025 23:34

gpu - use struct over array for clarity

45a787f

jeremylt force-pushed the jeremy/gen-matload-no-dupe branch from 9f620e6 to 45a787f Compare February 7, 2025 23:37

jeremylt merged commit c4ab032 into main Feb 7, 2025
27 of 28 checks passed

jeremylt deleted the jeremy/gen-matload-no-dupe branch February 7, 2025 23:40

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

cuda - remove duplicate mats in gen #1739

cuda - remove duplicate mats in gen #1739

jeremylt commented Feb 6, 2025

jeremylt Feb 7, 2025

cuda - remove duplicate mats in gen #1739

cuda - remove duplicate mats in gen #1739

Conversation

jeremylt commented Feb 6, 2025

jeremylt Feb 7, 2025

Choose a reason for hiding this comment