Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cuda - remove duplicate mats in gen #1739

Merged
merged 3 commits into from
Feb 7, 2025
Merged

Conversation

jeremylt
Copy link
Member

@jeremylt jeremylt commented Feb 6, 2025

Fixes #1737

This should let us run bigger elements for gen, and its just silly to do work we don't need to do.

Sample output:

// -----------------------------------------------------------------------------
// Operator Kernel
// 
// d_[in,out]_i:   CeedVector device array
// r_[in,out]_e_i: Element vector register
// r_[in,out]_q_i: Quadrature space vector register
// r_[in,out]_c_i: AtPoints Chebyshev coefficients register
// r_[in,out]_s_i: Quadrature space slice vector register
// 
// s_B_[in,out]_i: Interpolation matrix, shared memory
// s_G_[in,out]_i: Gradient matrix, shared memory
// -----------------------------------------------------------------------------
extern "C" __global__ void CeedKernelCudaGenOperator_Poisson3DApply(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda points) {
  const CeedScalar *d_in_0 = fields.inputs[0];
  const CeedScalar *d_in_1 = fields.inputs[1];
  CeedScalar *d_out_0 = fields.outputs[0];
  const CeedInt dim = 3;
  const CeedInt Q_1d = 6;
  extern __shared__ CeedScalar slice[];
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;
  data.slice = slice + data.t_id_z*T_1D*T_1D;

  // Input field constants and basis data
  // -- Input field 0
  const CeedInt P_1d_in_0 = 5;
  const CeedInt num_comp_in_0 = 1;
  // EvalMode: gradient
  __shared__ CeedScalar s_B_in_0[P_1d_in_0*Q_1d];
  LoadMatrix<P_1d_in_0, Q_1d>(data, B.inputs[0], s_B_in_0);
  __shared__ CeedScalar s_G_in_0[Q_1d*Q_1d];
  LoadMatrix<Q_1d, Q_1d>(data, G.inputs[0], s_G_in_0);
  // -- Input field 1
  const CeedInt P_1d_in_1 = 6;
  const CeedInt num_comp_in_1 = 6;
  // EvalMode: none

  // Output field constants and basis data
  // -- Output field 0
  const CeedInt P_1d_out_0 = 5;
  const CeedInt num_comp_out_0 = 1;
  // EvalMode: gradient
  CeedScalar *s_B_out_0 = s_B_in_0;
  CeedScalar *s_G_out_0 = s_G_in_0;

  // Element loop
  __syncthreads();
  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {
    // Scratch restriction buffer space
    CeedScalar r_e_scratch[1296];

    // -- Input field restrictions and basis actions
    // ---- Input field 0
    CeedScalar *r_e_in_0 = r_e_scratch;
    const CeedInt l_size_in_0 = 274625;
    // CompStride: 274625
    ReadLVecStandard3d<num_comp_in_0, 274625, P_1d_in_0>(data, l_size_in_0, elem, indices.inputs[0], d_in_0, r_e_in_0);
    // EvalMode: gradient
    CeedScalar r_q_in_0[num_comp_in_0*Q_1d];
    InterpTensor3d<num_comp_in_0, P_1d_in_0, Q_1d>(data, r_e_in_0, s_B_in_0, r_q_in_0);
    // ---- Input field 1
    CeedScalar r_e_in_1[num_comp_in_1*P_1d_in_1];
    // Strides: {1, 884736, 216}
    ReadLVecStrided3d<num_comp_in_1, P_1d_in_1, 1, 884736, 216>(data, elem, d_in_1, r_e_in_1);
    // EvalMode: none

    // -- Output field setup
    // ---- Output field 0
    CeedScalar r_q_out_0[num_comp_out_0*Q_1d];
    for (CeedInt i = 0; i < num_comp_out_0*Q_1d; i++) {
      r_q_out_0[i] = 0.0;
    }

    // Note: Using planes of 3D elements
    #pragma unroll
    for (CeedInt q = 0; q < Q_1d; q++) {
      // -- Input fields
      // ---- Input field 0
      // EvalMode: gradient
      CeedScalar r_s_in_0[num_comp_in_0*dim];
      GradColloSlice3d<num_comp_in_0, Q_1d>(data, q, r_q_in_0, s_G_in_0, r_s_in_0);
      // ---- Input field 1
      // EvalMode: none
      CeedScalar r_s_in_1[num_comp_in_1];
      // Strides: {1, 884736, 216}
      ReadEVecSliceStrided3d<num_comp_in_1, Q_1d, 1, 884736, 216>(data, elem, q, d_in_1, r_s_in_1);

      // -- Output fields
      // ---- Output field 0
      CeedScalar r_s_out_0[num_comp_out_0*dim];

      // -- QFunction inputs and outputs
      // ---- Inputs
      CeedScalar *inputs[2];
      // ------ Input field 0
      inputs[0] = r_s_in_0;
      // ------ Input field 1
      inputs[1] = r_s_in_1;
      // ---- Outputs
      CeedScalar *outputs[1];
      // ------ Output field 0
      outputs[0] = r_s_out_0;

      // -- Apply QFunction
      Poisson3DApply(ctx, 1, inputs, outputs);

      // -- Output fields
      // ---- Output field 0
      // EvalMode: gradient
      GradColloSliceTranspose3d<num_comp_out_0, Q_1d>(data, q, r_s_out_0, s_G_out_0, r_q_out_0);
    }

    // -- Output field basis action and restrictions
    // ---- Output field 0
    // EvalMode: gradient
    CeedScalar *r_e_out_0 = r_e_scratch;
    InterpTransposeTensor3d<num_comp_out_0, P_1d_out_0, Q_1d>(data, r_q_out_0, s_B_out_0, r_e_out_0);
    const CeedInt l_size_out_0 = 274625;
    // CompStride: 274625
    WriteLVecStandard3d<num_comp_out_0, 274625, P_1d_out_0>(data, l_size_out_0, elem, indices.outputs[0], r_e_out_0, d_out_0);
  }
}
// -----------------------------------------------------------------------------

Comment on lines 142 to 145
bool use_previous_field = field_reuse[0] != -1;
bool reuse_input = field_reuse[1];
CeedInt reuse_field = field_reuse[0];
CeedEvalMode reuse_mode = (CeedEvalMode)field_reuse[2];
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would probably be clearer to make a small struct to do this. In progress

@jeremylt jeremylt force-pushed the jeremy/gen-matload-no-dupe branch 2 times, most recently from b6fbfc6 to 9f620e6 Compare February 7, 2025 23:34
@jeremylt jeremylt force-pushed the jeremy/gen-matload-no-dupe branch from 9f620e6 to 45a787f Compare February 7, 2025 23:37
@jeremylt jeremylt merged commit c4ab032 into main Feb 7, 2025
27 of 28 checks passed
@jeremylt jeremylt deleted the jeremy/gen-matload-no-dupe branch February 7, 2025 23:40
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

De-duplicate Gen Basis Matrices
1 participant