
Move TMem tests to a separate file #4119

Open · wants to merge 4 commits into base: main
Conversation

zasdfgbnm (Collaborator)

No description provided.

github-actions bot commented Mar 20, 2025

Review updated until commit 72c2d0f

Description

  • Moved TMem tests to a separate file test_tmem.cpp

  • Updated CMakeLists.txt to include the new test file


Changes walkthrough 📝

Relevant files

Tests

  • tests/cpp/test_memory.cpp: Removed TMem test cases from test_memory.cpp (+0/-257)

  • tests/cpp/test_tmem.cpp: Added TMem test cases to test_tmem.cpp (+279/-0)

Configuration changes

  • CMakeLists.txt: Included test_tmem.cpp in the JIT test sources (+1/-0); a hypothetical sketch of this change follows below.
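
The CMakeLists.txt hunk itself is not shown in this view. As a rough sketch only (the JIT_TEST_SRCS variable name and the path prefix are assumptions, not necessarily the identifiers used in the repository), the one-line addition could look like:

  # Hypothetical sketch of the CMakeLists.txt change; JIT_TEST_SRCS and the
  # path prefix are assumed names, not necessarily those used by nvFuser.
  list(APPEND JIT_TEST_SRCS
    ${CMAKE_SOURCE_DIR}/tests/cpp/test_tmem.cpp
  )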

PR Reviewer Guide 🔍

Here are some key observations to aid the review process:

🧪 PR contains tests
⚡ Recommended focus areas for review

Code Duplication

The tests for TMem are duplicated in both test_memory.cpp and test_tmem.cpp. Ensure that the tests are not duplicated and are correctly organized in the new file.

// clang-format off
/*
 * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
 * All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */
// clang-format on
#include <gmock/gmock-matchers.h>
#include <gtest/gtest.h>

#include <fusion.h>
#include <ops/alias.h>
#include <ops/arith.h>
#include <scheduler/tools/inlining.h>
#include <type.h>

#include <tests/cpp/utils.h>
#include <tests/cpp/validator.h>

namespace nvfuser {

// Tensor memory tests
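// Tensor memory (TMem) and the tcgen05.* instructions exercised below are
// Blackwell features, hence the Blackwell-only base fixture.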
using TMemTest = BlackwellBase;

TEST_F(TMemTest, GmemRegTMemRegGmemCopy) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  auto tv1 = set(tv0); // register
  auto tv2 = set(tv1); // tmem
  auto tv3 = set(tv2); // register
  auto tv4 = set(tv3); // gmem
  fusion.addOutput(tv4);

  tv2->setMemoryType(MemoryType::Tensor);
  tv2->definition()->as<LoadStoreOp>()->setOpType(LoadStoreOpType::StTMem);
  tv3->definition()->as<LoadStoreOp>()->setOpType(LoadStoreOpType::LdTMem);

  tv4->split(0, 32);

  TransformPropagator propagator(tv4);
  MaxLogicalDomainInfoSpanningTree(tv4).traverse(&propagator);

  tv4->axis(0)->parallelize(ParallelType::BIDx);
  tv4->axis(1)->parallelize(ParallelType::TIDx);

  scheduler_utils::parallelizeAllLike(tv4, {tv1, tv2, tv3});

  tv2->setAllocationDomain(tv2->getLoopDomain(), true);
  tv2->setTMemDimSepPos(-1);

  inlineMost();

  KernelExecutor ke;
  ke.compile(&fusion);
  auto t0 = at::randn(
      {12800}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0));
  auto cg_outputs = ke.run({t0});
  testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
}

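// Builds and runs a kernel that routes two inputs through tensor memory
// (gmem -> register -> tmem -> register), adds them in registers, and writes
// the sum back to global memory. When `same_region` is true, the two TMem
// tensors are declared to share a single TMem region.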
void testTMemAddKernel(bool same_region) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  auto tv1 = set(tv0); // register
  auto tv2 = set(tv1); // tmem
  auto tv3 = set(tv2); // register
  auto tv4 = makeSymbolicTensor(1);
  fusion.addInput(tv4);
  auto tv5 = set(tv4); // register
  auto tv6 = set(tv5); // tmem
  auto tv7 = set(tv6); // register
  auto tv8 = add(tv3, tv7); // register
  auto tv9 = set(tv8); // gmem
  fusion.addOutput(tv9);

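  // Optionally place tv2 and tv6 in the same TMem region via managed fusion
  // data. Sharing a region is expected to produce a single
  // tcgen05.alloc/tcgen05.dealloc pair, which the lowering hook below checks.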
  if (same_region) {
    using Region = std::vector<TensorView*>;
    Region region1{tv2, tv6};
    std::vector<Region> regions{region1};
    fusion.manage("tmem_regions", regions);
  }

  tv2->setMemoryType(MemoryType::Tensor);
  tv2->definition()->as<LoadStoreOp>()->setOpType(LoadStoreOpType::StTMem);
  tv3->definition()->as<LoadStoreOp>()->setOpType(LoadStoreOpType::LdTMem);

  tv6->setMemoryType(MemoryType::Tensor);
  tv6->definition()->as<LoadStoreOp>()->setOpType(LoadStoreOpType::StTMem);
  tv7->definition()->as<LoadStoreOp>()->setOpType(LoadStoreOpType::LdTMem);

  tv9->split(0, 32);

  TransformPropagator propagator(tv9);
  MaxLogicalDomainInfoSpanningTree(tv9).traverse(&propagator);

  tv9->axis(0)->parallelize(ParallelType::BIDx);
  tv9->axis(1)->parallelize(ParallelType::TIDx);

  scheduler_utils::parallelizeAllLike(tv9);

  for (auto tv : {tv2, tv6}) {
    tv->setAllocationDomain(tv->getLoopDomain(), true);
    tv->setTMemDimSepPos(-1);
  }

  inlineMost();

  KernelExecutor ke;

  // Register a lowering hook that counts the tcgen05.alloc and
  // tcgen05.dealloc instructions in the lowered kernel: one of each is
  // expected when both TMem tensors share a region, two of each otherwise.
  ke.registerLoweringHook([same_region](GpuLower* lower) {
    auto check_pass = [same_region](const std::vector<Expr*>& exprs) {
      int64_t num_allocs =
          std::count_if(exprs.begin(), exprs.end(), [](Expr* expr) {
            std::string str = expr->toString();
            return str.find("tcgen05.alloc") != std::string::npos;
          });
      EXPECT_EQ(num_allocs, same_region ? 1 : 2);
      int64_t num_deallocs = 0;
      for (auto expr : exprs) {
        std::string str = expr->toString();
        std::string sub = "tcgen05.dealloc";
        // count number of sub in str
        size_t pos = 0;
        while ((pos = str.find(sub, pos)) != std::string::npos) {
          ++num_deallocs;
          pos += sub.length();
        }
      }
      EXPECT_EQ(num_deallocs, same_region ? 1 : 2);
      return exprs;
    };
    lower->passes().push_back({"Check result", check_pass});
  });

  ke.compile(&fusion);
  auto t0 = at::randn(
      {12800}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0));
  auto t1 = at::randn(
      {12800}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0));
  auto cg_outputs = ke.run({t0, t1});
  testValidate(&fusion, cg_outputs, {t0, t1}, {t0 + t1}, __LINE__, __FILE__);
}

TEST_F(TMemTest, AddKernelMultipleRegions) {
  testTMemAddKernel(false);
}

TEST_F(TMemTest, AddKernelSameRegion) {
  testTMemAddKernel(true);
}

using TMemTestCompileOnly = NVFuserTest;

TEST_F(TMemTestCompileOnly, SetTMemDimSepPosNonTMem) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeContigConcreteTensor({2, 33});
  fusion.addInput(tv0);
  auto tv1 = set(tv0);
  fusion.addOutput(tv1);

  EXPECT_THAT(
      [&]() { tv1->setTMemDimSepPos(-1); },
      ::testing::ThrowsMessage<nvfuser::nvfError>(::testing::HasSubstr(
          "TMem dimension separator is only supported for tensor memory")));
}

// Test that we are checking the stride of the "outer parallel types".
// If, in a kernel, the parallel dimension map is [TIDy, TIDx] = [2, 33],
// but in the TMem load/store's loop domain, Ix (the ID parallelized on TIDx)
// has extent 32, then we will generate code like:
//   if (threadIdx.x < 32) {
//     tmem::load
//   }
// For threadIdx.y == 0 this is correct, but for threadIdx.y == 1 it is wrong,
// because the load would use threads 33-64, which do not form a warp.
TEST_F(TMemTestCompileOnly, WrongStride) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeContigConcreteTensor({2, 33});
  fusion.addInput(tv0);
  auto tv1 = set(tv0); // gmem
  auto tv2 = set(tv1); // register
  auto tv3 = set(tv2); // tmem
  auto tv4 = set(tv3); // register
  auto tv5 = set(tv4); // gmem
  fusion.addOutput(tv5);

  tv1->setMemoryType(MemoryType::Global);
  tv3->setMemoryType(MemoryType::Tensor);
  tv3->definition()->as<LoadStoreOp>()->setOpType(LoadStoreOpType::StTMem);
  tv4->definition()->as<LoadStoreOp>()->setOpType(LoadStoreOpType::LdTMem);

  // [TIDy{2}, TIDx{33}]
  tv1->axis(0)->parallelize(ParallelType::TIDy);
  tv1->axis(1)->parallelize(ParallelType::TIDx);

  // [TIDy{2}, Serial{2}, TIDx{32}]
  for (auto tv : {tv2, tv3, tv4, tv5}) {
    tv->split(1, 32);
    tv->axis(0)->parallelize(ParallelType::TIDy);
    tv->axis(-1)->parallelize(ParallelType::TIDx);
  }

  tv3->setAllocationDomain(tv3->getLoopDomain(), true);
  tv3->setTMemDimSepPos(-1);

  inlineMost();

  KernelExecutor ke;

  EXPECT_THAT(
      [&]() { ke.compile(&fusion); },
      ::testing::ThrowsMessage<nvfuser::nvfError>(::testing::HasSubstr(
          "Invalid data access pattern in TMem load/store: "
          "Outer parallel types' strides must be a multiple of 32.")));
}

// This test is a variant of the WrongStride test, but this case is valid.
// It tests a case where the parallel types are not exact: the parallel
// dimension map is [TIDy, TIDx] = [2, 33], but in the TMem load/store's loop
// domain we have Iy{1}, Ix{32}. The generated code will be like:
//   if (threadIdx.x < 32 && threadIdx.y < 1) {
//     tmem::load
//   }
// This is valid because we are using a whole warp for the load.
TEST_F(TMemTest, InexactParallelType) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeContigConcreteTensor({2, 33});
  fusion.addInput(tv0);
  auto tv1 = set(tv0); // gmem
  auto tv2 = set(tv1); // register
  auto tv3 = set(tv2); // tmem
  auto tv4 = set(tv3); // register
  auto tv5 = set(tv4); // gmem
  fusion.addOutput(tv5);

  tv1->setMemoryType(MemoryType::Global);
  tv3->setMemoryType(MemoryType::Tensor);
  tv3->definition()->as<LoadStoreOp>()->setOpType(LoadStoreOpType::StTMem);
  tv4->definition()->as<LoadStoreOp>()->setOpType(LoadStoreOpType::LdTMem);

  // [TIDy{2}, TIDx{33}]
  tv1->axis(0)->parallelize(ParallelType::TIDy);
  tv1->axis(1)->parallelize(ParallelType::TIDx);

  // [Serial{2}, TIDy{1}, Serial{2}, TIDx{32}]
  for (auto tv : {tv2, tv3, tv4, tv5}) {
    tv->split(1, 32);
    tv->split(0, 1);
    tv->axis(1)->parallelize(ParallelType::TIDy);
    tv->axis(-1)->parallelize(ParallelType::TIDx);
  }

  tv3->setAllocationDomain(tv3->getLoopDomain(), true);
  tv3->setTMemDimSepPos(-1);

  inlineMost();

  KernelExecutor ke;
  ke.compile(&fusion);
  auto t0 = at::randn(
      {2, 33}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0));
  auto cg_outputs = ke.run({t0});
  testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
}

} // namespace nvfuser

zasdfgbnm changed the title from "Refactor tmem" to "Move TMem tests to a separate file" on Mar 24, 2025
zasdfgbnm (Collaborator, Author) commented:

!test

zasdfgbnm marked this pull request as ready for review on March 24, 2025 23:34
zasdfgbnm requested a review from rdspring1 on March 24, 2025 23:34