FP4 support in pointwise scheduler #4666

Draft · wants to merge 1 commit into main

Conversation

zasdfgbnm (Collaborator)

No description provided.

Description

  • Added FP4 support in pointwise scheduler

  • Renamed test classes and functions for clarity

  • Introduced new test cases for scheduler


Changes walkthrough 📝

Relevant files

Enhancement: tests/cpp/test_gpu1.cpp - Enhance FP4 support and add scheduler tests

  • Renamed Float4E2m1TestParams to Float4E2m1ManualScheduleTestParams
  • Renamed Float4E2m1TestAllArch to Float4E2m1ManualScheduleTestAllArch
  • Renamed CopyKernelManualSchedule to CopyKernel
  • Renamed fp4E2m1Name to fp4E2m1ManualScheduleName
  • Added new test class Float4E2m1SchedulerTestAllArch
  • Added new test function CopyKernel for scheduler
  • Added new test cases with varying sizes and dynamic shapes
  • +67/-8   
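
A note for reading the new tests: the Float4_e2m1 fusion inputs are backed by uint8 tensors, with two FP4 values packed per byte, which is why the concrete 2048-element input is fed 1024 random bytes and the scheduler test allocates {size / 2} elements. A minimal sketch of that relationship (variable names are illustrative only):

    // Two Float4_e2m1 values occupy one byte, so an N-element FP4 fusion input
    // is materialized as an N / 2 element uint8 tensor, as in the tests below.
    int64_t num_fp4_elements = 2048;
    auto options = at::TensorOptions().dtype(torch::kUInt8).device(at::kCUDA, 0);
    at::Tensor backing = at::randint(0, 256, {num_fp4_elements / 2}, options);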

    PR Reviewer Guide 🔍

    Here are some key observations to aid the review process:

    🧪 PR contains tests
    ⚡ Recommended focus areas for review

    Naming Consistency

    The naming of test classes and functions is inconsistent. For example, Float4E2m1ManualScheduleTestAllArch and Float4E2m1SchedulerTestAllArch are introduced, but the old class Float4E2m1TestAllArch is still present. Ensure that the naming is consistent and that the old classes/functions are either updated or removed if they are no longer needed.

    using Float4E2m1ManualScheduleTestParams = std::tuple<int64_t, bool>;
    
    class Float4E2m1ManualScheduleTestAllArch
        : public NVFuserFixtureParamTest<Float4E2m1ManualScheduleTestParams> {
     protected:
      int64_t vectorize_factor;
      bool dynamic_shape;
      void SetUp() {
        std::tie(vectorize_factor, dynamic_shape) = GetParam();
      }
    };
    
    TEST_P(Float4E2m1ManualScheduleTestAllArch, CopyKernel) {
      Fusion fusion;
      FusionGuard fg(&fusion);
    
      TensorView* tv0 = dynamic_shape
          ? makeContigTensor(1, DataType::Float4_e2m1)
          : makeContigConcreteTensor({2048}, DataType::Float4_e2m1);
      fusion.addInput(tv0);
      TensorView* tv1 = set(tv0);
      fusion.addOutput(tv1);
    
      tv1->split(0, vectorize_factor);
      tv1->axis(0)->parallelize(ParallelType::TIDx);
      tv1->axis(1)->parallelize(ParallelType::Vectorize);
    
      inlineMost();
    
      auto options = at::TensorOptions().dtype(torch::kUInt8).device(at::kCUDA, 0);
      at::Tensor input = at::randint(0, 256, {1024}, options);
    
      KernelExecutor ke;
      if (vectorize_factor == 1) {
        EXPECT_THAT(
            [&]() { ke.compile(&fusion, {input}); },
            testing::ThrowsMessage<nvfuser::nvfError>(testing::HasSubstr(
                "Tried to vectorize a dim resulting in a word size of 4 bits, "
                "however, vector sizes starting from and including 8 bits upto and "
                "including 128 bits are supported.")));
      } else {
        ke.compile(&fusion, {input});
        auto outputs = ke.run({input});
        EXPECT_TRUE(outputs[0].as<at::Tensor>().equal(input));
      }
    }
    
    std::string fp4E2m1ManualScheduleName(
        const testing::TestParamInfo<Float4E2m1ManualScheduleTestParams>& info) {
      const auto& [vectorize_factor, dynamic_shape] = info.param;
      return "Vectorize" + std::to_string(vectorize_factor) + "_DynamicShape" +
          std::to_string(dynamic_shape);
    }
    
    INSTANTIATE_TEST_SUITE_P(
        ,
        Float4E2m1ManualScheduleTestAllArch,
        testing::Combine(
            testing::Values(1, 2, 4, 8, 16, 32),
            testing::Values(false, true)),
        fp4E2m1ManualScheduleName);
    
    using Float4E2m1SchedulerTestParams = std::tuple<int64_t, bool>;
    
    class Float4E2m1SchedulerTestAllArch
        : public NVFuserFixtureParamTest<Float4E2m1SchedulerTestParams> {
     protected:
      int64_t size;
      bool dynamic_shape;
      void SetUp() {
        std::tie(size, dynamic_shape) = GetParam();
      }
    };
    
    TEST_P(Float4E2m1SchedulerTestAllArch, CopyKernel) {
      auto fusion_ptr = std::make_unique<Fusion>();
      auto& fusion = *fusion_ptr;
      FusionGuard fg(fusion_ptr.get());
    
      TensorView* tv0 = dynamic_shape
          ? makeContigTensor(1, DataType::Float4_e2m1)
          : makeContigConcreteTensor({size}, DataType::Float4_e2m1);
      fusion.addInput(tv0);
      TensorView* tv1 = set(tv0);
      fusion.addOutput(tv1);
    
      auto options = at::TensorOptions().dtype(torch::kUInt8).device(at::kCUDA, 0);
      at::Tensor input = at::randint(0, 256, {size / 2}, options);
    
      FusionExecutorCache executor_cache(std::move(fusion_ptr));
      auto outputs = executor_cache.runFusionWithInputs({input});
    
      EXPECT_TRUE(outputs[0].as<at::Tensor>().equal(input));
    }
    
    std::string fp4E2m1SchedulerName(
        const testing::TestParamInfo<Float4E2m1SchedulerTestParams>& info) {
      const auto& [size, dynamic_shape] = info.param;
      return "Size" + std::to_string(size) + "_DynamicShape" +
          std::to_string(dynamic_shape);
    }
    
    INSTANTIATE_TEST_SUITE_P(
        ,
        Float4E2m1SchedulerTestAllArch,
        testing::Combine(
            testing::Values(
                2,
                64,
                64 + 2,
                1024,
                1024 + 2,
                64 * 1024,
                64 * 1024 + 2,
                1024 * 1024,
                1024 * 1024 + 2,
                64 * 1024 * 1024,
                64 * 1024 * 1024 + 2),
            testing::Values(false, true)),
        fp4E2m1SchedulerName);

    Test Coverage

    The new tests cover a wide range of sizes, vectorization factors, and static/dynamic shapes, but they only exercise a simple copy kernel. Consider whether the cases are representative of real-world pointwise fusions and whether additional edge cases are needed, as sketched below.
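
    One way to extend coverage (a sketch only; the EdgeCases prefix and the extra sizes are illustrative and not part of this PR) is a second instantiation of the existing scheduler fixture, reusing fp4E2m1SchedulerName:

    // Sketch: extra edge-case sizes for the scheduler test (all even, since the
    // uint8 backing tensor is allocated with {size / 2}).
    INSTANTIATE_TEST_SUITE_P(
        EdgeCases,
        Float4E2m1SchedulerTestAllArch,
        testing::Combine(
            testing::Values(4, 6, 1024 - 2, 4096 + 2),
            testing::Values(false, true)),
        fp4E2m1SchedulerName);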

    Performance Metrics

    While new tests are added, there is no mention of performance metrics or benchmarks. It is crucial to include performance data to validate the effectiveness of the FP4 support in the pointwise scheduler.
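
    If benchmark data is wanted, one lightweight option (a sketch only, not part of this PR; it reuses ke and input from the manual-schedule test above and assumes timing on the default stream) is to wrap the existing KernelExecutor::run call in CUDA events and report effective bandwidth:

    // Sketch: CUDA-event timing around the compiled copy kernel (requires
    // <cuda_runtime.h>). The iteration count is arbitrary.
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    constexpr int kIters = 100;
    ke.run({input});  // warm-up

    cudaEventRecord(start);
    for (int i = 0; i < kIters; ++i) {
      ke.run({input});
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float total_ms = 0.0f;
    cudaEventElapsedTime(&total_ms, start, stop);
    // Effective bandwidth: each iteration reads and writes input.numel() bytes.
    double gb_per_s = 2.0 * input.numel() * kIters / 1.0e9 / (total_ms / 1.0e3);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);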

