
Move TMem tests to a separate file #4119

Open · wants to merge 4 commits into base: main
Conversation

zasdfgbnm (Collaborator)

No description provided.

github-actions bot commented Mar 20, 2025

Review updated until commit 72c2d0f

Description

  • Moved TMem tests to a separate file test_tmem.cpp

  • Updated CMakeLists.txt to include the new test file


Changes walkthrough 📝

Relevant files

Tests

  • tests/cpp/test_memory.cpp: Removed TMem test cases from test_memory.cpp (+0/-257)

  • tests/cpp/test_tmem.cpp: Added TMem test cases to test_tmem.cpp (+279/-0)

Configuration changes

  • CMakeLists.txt: Included test_tmem.cpp in the JIT test sources (+1/-0); a hypothetical sketch of this change follows below.
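
The CMakeLists.txt hunk itself is not shown in this view. As a rough sketch only (the JIT_TEST_SRCS variable name and the path prefix are assumptions, not necessarily the identifiers used in the repository), the one-line addition could look like:

  # Hypothetical sketch of the CMakeLists.txt change; JIT_TEST_SRCS and the
  # path prefix are assumed names, not necessarily those used by nvFuser.
  list(APPEND JIT_TEST_SRCS
    ${CMAKE_SOURCE_DIR}/tests/cpp/test_tmem.cpp
  )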

PR Reviewer Guide 🔍

Here are some key observations to aid the review process:

🧪 PR contains tests
⚡ Recommended focus areas for review

Code Duplication

The tests for TMem are duplicated in both test_memory.cpp and test_tmem.cpp. Ensure that the tests are not duplicated and are correctly organized in the new file.

// clang-format off
/*
 * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
 * All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */
// clang-format on
#include <gmock/gmock-matchers.h>
#include <gtest/gtest.h>

#include <fusion.h>
#include <ops/alias.h>
#include <ops/arith.h>
#include <scheduler/tools/inlining.h>
#include <type.h>

#include <tests/cpp/utils.h>
#include <tests/cpp/validator.h>

namespace nvfuser {

// Tensor memory tests
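// Tensor memory (TMem) and the tcgen05.* instructions exercised below are
// Blackwell features, hence the Blackwell-only base fixture.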
using TMemTest = BlackwellBase;

TEST_F(TMemTest, GmemRegTMemRegGmemCopy) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  auto tv1 = set(tv0); // register
  auto tv2 = set(tv1); // tmem
  auto tv3 = set(tv2); // register
  auto tv4 = set(tv3); // gmem
  fusion.addOutput(tv4);

  tv2->setMemoryType(MemoryType::Tensor);
  tv2->definition()->as<LoadStoreOp>()->setOpType(LoadStoreOpType::StTMem);
  tv3->definition()->as<LoadStoreOp>()->setOpType(LoadStoreOpType::LdTMem);

  tv4->split(0, 32);

  TransformPropagator propagator(tv4);
  MaxLogicalDomainInfoSpanningTree(tv4).traverse(&propagator);

  tv4->axis(0)->parallelize(ParallelType::BIDx);
  tv4->axis(1)->parallelize(ParallelType::TIDx);

  scheduler_utils::parallelizeAllLike(tv4, {tv1, tv2, tv3});

  tv2->setAllocationDomain(tv2->getLoopDomain(), true);
  tv2->setTMemDimSepPos(-1);

  inlineMost();

  KernelExecutor ke;
  ke.compile(&fusion);
  auto t0 = at::randn(
      {12800}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0));
  auto cg_outputs = ke.run({t0});
  testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
}

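// Builds and runs a kernel that routes two inputs through tensor memory
// (gmem -> register -> tmem -> register), adds them in registers, and writes
// the sum back to global memory. When `same_region` is true, the two TMem
// tensors are declared to share a single TMem region.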
void testTMemAddKernel(bool same_region) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  auto tv1 = set(tv0); // register
  auto tv2 = set(tv1); // tmem
  auto tv3 = set(tv2); // register
  auto tv4 = makeSymbolicTensor(1);
  fusion.addInput(tv4);
  auto tv5 = set(tv4); // register
  auto tv6 = set(tv5); // tmem
  auto tv7 = set(tv6); // register
  auto tv8 = add(tv3, tv7); // register
  auto tv9 = set(tv8); // gmem
  fusion.addOutput(tv9);

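  // Optionally place tv2 and tv6 in the same TMem region via managed fusion
  // data. Sharing a region is expected to produce a single
  // tcgen05.alloc/tcgen05.dealloc pair, which the lowering hook below checks.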
  if (same_region) {
    using Region = std::vector<TensorView*>;
    Region region1{tv2, tv6};
    std::vector<Region> regions{region1};
    fusion.manage("tmem_regions", regions);
  }

  tv2->setMemoryType(MemoryType::Tensor);
  tv2->definition()->as<LoadStoreOp>()->setOpType(LoadStoreOpType::StTMem);
  tv3->definition()->as<LoadStoreOp>()->setOpType(LoadStoreOpType::LdTMem);

  tv6->setMemoryType(MemoryType::Tensor);
  tv6->definition()->as<LoadStoreOp>()->setOpType(LoadStoreOpType::StTMem);
  tv7->definition()->as<LoadStoreOp>()->setOpType(LoadStoreOpType::LdTMem);

  tv9->split(0, 32);

  TransformPropagator propagator(tv9);
  MaxLogicalDomainInfoSpanningTree(tv9).traverse(&propagator);

  tv9->axis(0)->parallelize(ParallelType::BIDx);
  tv9->axis(1)->parallelize(ParallelType::TIDx);

  scheduler_utils::parallelizeAllLike(tv9);

  for (auto tv : {tv2, tv6}) {
    tv->setAllocationDomain(tv->getLoopDomain(), true);
    tv->setTMemDimSepPos(-1);
  }

  inlineMost();

  KernelExecutor ke;

  // Register a lowering hook that counts the tcgen05.alloc and
  // tcgen05.dealloc instructions in the lowered kernel: one of each is
  // expected when both TMem tensors share a region, two of each otherwise.
  ke.registerLoweringHook([same_region](GpuLower* lower) {
    auto check_pass = [same_region](const std::vector<Expr*>& exprs) {
      int64_t num_allocs =
          std::count_if(exprs.begin(), exprs.end(), [](Expr* expr) {
            std::string str = expr->toString();
            return str.find("tcgen05.alloc") != std::string::npos;
          });
      EXPECT_EQ(num_allocs, same_region ? 1 : 2);
      int64_t num_deallocs = 0;
      for (auto expr : exprs) {
        std::string str = expr->toString();
        std::string sub = "tcgen05.dealloc";
        // count number of sub in str
        size_t pos = 0;
        while ((pos = str.find(sub, pos)) != std::string::npos) {
          ++num_deallocs;
          pos += sub.length();
        }
      }
      EXPECT_EQ(num_deallocs, same_region ? 1 : 2);
      return exprs;
    };
    lower->passes().push_back({"Check result", check_pass});
  });

  ke.compile(&fusion);
  auto t0 = at::randn(
      {12800}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0));
  auto t1 = at::randn(
      {12800}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0));
  auto cg_outputs = ke.run({t0, t1});
  testValidate(&fusion, cg_outputs, {t0, t1}, {t0 + t1}, __LINE__, __FILE__);
}

TEST_F(TMemTest, AddKernelMultipleRegions) {
  testTMemAddKernel(false);
}

TEST_F(TMemTest, AddKernelSameRegion) {
  testTMemAddKernel(true);
}

using TMemTestCompileOnly = NVFuserTest;

TEST_F(TMemTestCompileOnly, SetTMemDimSepPosNonTMem) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeContigConcreteTensor({2, 33});
  fusion.addInput(tv0);
  auto tv1 = set(tv0);
  fusion.addOutput(tv1);

  EXPECT_THAT(
      [&]() { tv1->setTMemDimSepPos(-1); },
      ::testing::ThrowsMessage<nvfuser::nvfError>(::testing::HasSubstr(
          "TMem dimension separator is only supported for tensor memory")));
}

// Test that we are checking the stride of the "outer parallel types".
// If, in a kernel, the parallel dimension map is [TIDy, TIDx] = [2, 33],
// but in the TMem load/store's loop domain, Ix (the ID parallelized on TIDx)
// has extent 32, then we will generate code like:
//   if (threadIdx.x < 32) {
//     tmem::load
//   }
// For threadIdx.y == 0 this is correct, but for threadIdx.y == 1 it is wrong,
// because the load would use threads 33-64, which do not form a warp.
TEST_F(TMemTestCompileOnly, WrongStride) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeContigConcreteTensor({2, 33});
  fusion.addInput(tv0);
  auto tv1 = set(tv0); // gmem
  auto tv2 = set(tv1); // register
  auto tv3 = set(tv2); // tmem
  auto tv4 = set(tv3); // register
  auto tv5 = set(tv4); // gmem
  fusion.addOutput(tv5);

  tv1->setMemoryType(MemoryType::Global);
  tv3->setMemoryType(MemoryType::Tensor);
  tv3->definition()->as<LoadStoreOp>()->setOpType(LoadStoreOpType::StTMem);
  tv4->definition()->as<LoadStoreOp>()->setOpType(LoadStoreOpType::LdTMem);

  // [TIDy{2}, TIDx{33}]
  tv1->axis(0)->parallelize(ParallelType::TIDy);
  tv1->axis(1)->parallelize(ParallelType::TIDx);

  // [TIDy{2}, Serial{2}, TIDx{32}]
  for (auto tv : {tv2, tv3, tv4, tv5}) {
    tv->split(1, 32);
    tv->axis(0)->parallelize(ParallelType::TIDy);
    tv->axis(-1)->parallelize(ParallelType::TIDx);
  }

  tv3->setAllocationDomain(tv3->getLoopDomain(), true);
  tv3->setTMemDimSepPos(-1);

  inlineMost();

  KernelExecutor ke;

  EXPECT_THAT(
      [&]() { ke.compile(&fusion); },
      ::testing::ThrowsMessage<nvfuser::nvfError>(::testing::HasSubstr(
          "Invalid data access pattern in TMem load/store: "
          "Outer parallel types' strides must be a multiple of 32.")));
}

// This test is a variant of the WrongStride test, but this case is valid.
// It tests a case where the parallel types are not exact: the parallel
// dimension map is [TIDy, TIDx] = [2, 33], but in the TMem load/store's loop
// domain we have Iy{1}, Ix{32}. The generated code will be like:
//   if (threadIdx.x < 32 && threadIdx.y < 1) {
//     tmem::load
//   }
// This is valid because we are using a whole warp for the load.
TEST_F(TMemTest, InexactParallelType) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeContigConcreteTensor({2, 33});
  fusion.addInput(tv0);
  auto tv1 = set(tv0); // gmem
  auto tv2 = set(tv1); // register
  auto tv3 = set(tv2); // tmem
  auto tv4 = set(tv3); // register
  auto tv5 = set(tv4); // gmem
  fusion.addOutput(tv5);

  tv1->setMemoryType(MemoryType::Global);
  tv3->setMemoryType(MemoryType::Tensor);
  tv3->definition()->as<LoadStoreOp>()->setOpType(LoadStoreOpType::StTMem);
  tv4->definition()->as<LoadStoreOp>()->setOpType(LoadStoreOpType::LdTMem);

  // [TIDy{2}, TIDx{33}]
  tv1->axis(0)->parallelize(ParallelType::TIDy);
  tv1->axis(1)->parallelize(ParallelType::TIDx);

  // [Serial{2}, TIDy{1}, Serial{2}, TIDx{32}]
  for (auto tv : {tv2, tv3, tv4, tv5}) {
    tv->split(1, 32);
    tv->split(0, 1);
    tv->axis(1)->parallelize(ParallelType::TIDy);
    tv->axis(-1)->parallelize(ParallelType::TIDx);
  }

  tv3->setAllocationDomain(tv3->getLoopDomain(), true);
  tv3->setTMemDimSepPos(-1);

  inlineMost();

  KernelExecutor ke;
  ke.compile(&fusion);
  auto t0 = at::randn(
      {2, 33}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0));
  auto cg_outputs = ke.run({t0});
  testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
}

} // namespace nvfuser

zasdfgbnm changed the title from "Refactor tmem" to "Move TMem tests to a separate file" on Mar 24, 2025
zasdfgbnm (Collaborator, Author) commented:

!test

zasdfgbnm marked this pull request as ready for review on March 24, 2025 23:34
zasdfgbnm requested a review from rdspring1 on March 24, 2025 23:34