FP4 support in pointwise scheduler #4666

Draft · wants to merge 1 commit into main

Conversation

zasdfgbnm (Collaborator)

No description provided.

Description

  • Added FP4 support in pointwise scheduler

  • Renamed test classes and functions for clarity

  • Introduced new test cases for scheduler


Changes walkthrough 📝

Relevant files

Enhancement: tests/cpp/test_gpu1.cpp - Enhance FP4 support and add scheduler tests

  • Renamed Float4E2m1TestParams to Float4E2m1ManualScheduleTestParams
  • Renamed Float4E2m1TestAllArch to Float4E2m1ManualScheduleTestAllArch
  • Renamed CopyKernelManualSchedule to CopyKernel
  • Renamed fp4E2m1Name to fp4E2m1ManualScheduleName
  • Added new test class Float4E2m1SchedulerTestAllArch
  • Added new test function CopyKernel for scheduler
  • Added new test cases with varying sizes and dynamic shapes
  • +67/-8   
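
A note for reading the new tests: the Float4_e2m1 fusion inputs are backed by uint8 tensors, with two FP4 values packed per byte, which is why the concrete 2048-element input is fed 1024 random bytes and the scheduler test allocates {size / 2} elements. A minimal sketch of that relationship (variable names are illustrative only):

    // Two Float4_e2m1 values occupy one byte, so an N-element FP4 fusion input
    // is materialized as an N / 2 element uint8 tensor, as in the tests below.
    int64_t num_fp4_elements = 2048;
    auto options = at::TensorOptions().dtype(torch::kUInt8).device(at::kCUDA, 0);
    at::Tensor backing = at::randint(0, 256, {num_fp4_elements / 2}, options);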

    PR Reviewer Guide 🔍

    Here are some key observations to aid the review process:

    🧪 PR contains tests
    ⚡ Recommended focus areas for review

    Naming Consistency

    The naming of test classes and functions is inconsistent. For example, Float4E2m1ManualScheduleTestAllArch and Float4E2m1SchedulerTestAllArch are introduced, but the old class Float4E2m1TestAllArch is still present. Ensure that the naming is consistent and that the old classes/functions are either updated or removed if they are no longer needed.

    using Float4E2m1ManualScheduleTestParams = std::tuple<int64_t, bool>;
    
    class Float4E2m1ManualScheduleTestAllArch
        : public NVFuserFixtureParamTest<Float4E2m1ManualScheduleTestParams> {
     protected:
      int64_t vectorize_factor;
      bool dynamic_shape;
      void SetUp() {
        std::tie(vectorize_factor, dynamic_shape) = GetParam();
      }
    };
    
    TEST_P(Float4E2m1ManualScheduleTestAllArch, CopyKernel) {
      Fusion fusion;
      FusionGuard fg(&fusion);
    
      TensorView* tv0 = dynamic_shape
          ? makeContigTensor(1, DataType::Float4_e2m1)
          : makeContigConcreteTensor({2048}, DataType::Float4_e2m1);
      fusion.addInput(tv0);
      TensorView* tv1 = set(tv0);
      fusion.addOutput(tv1);
    
      tv1->split(0, vectorize_factor);
      tv1->axis(0)->parallelize(ParallelType::TIDx);
      tv1->axis(1)->parallelize(ParallelType::Vectorize);
    
      inlineMost();
    
      auto options = at::TensorOptions().dtype(torch::kUInt8).device(at::kCUDA, 0);
      at::Tensor input = at::randint(0, 256, {1024}, options);
    
      KernelExecutor ke;
      if (vectorize_factor == 1) {
        EXPECT_THAT(
            [&]() { ke.compile(&fusion, {input}); },
            testing::ThrowsMessage<nvfuser::nvfError>(testing::HasSubstr(
                "Tried to vectorize a dim resulting in a word size of 4 bits, "
                "however, vector sizes starting from and including 8 bits upto and "
                "including 128 bits are supported.")));
      } else {
        ke.compile(&fusion, {input});
        auto outputs = ke.run({input});
        EXPECT_TRUE(outputs[0].as<at::Tensor>().equal(input));
      }
    }
    
    std::string fp4E2m1ManualScheduleName(
        const testing::TestParamInfo<Float4E2m1ManualScheduleTestParams>& info) {
      const auto& [vectorize_factor, dynamic_shape] = info.param;
      return "Vectorize" + std::to_string(vectorize_factor) + "_DynamicShape" +
          std::to_string(dynamic_shape);
    }
    
    INSTANTIATE_TEST_SUITE_P(
        ,
        Float4E2m1ManualScheduleTestAllArch,
        testing::Combine(
            testing::Values(1, 2, 4, 8, 16, 32),
            testing::Values(false, true)),
        fp4E2m1ManualScheduleName);
    
    using Float4E2m1SchedulerTestParams = std::tuple<int64_t, bool>;
    
    class Float4E2m1SchedulerTestAllArch
        : public NVFuserFixtureParamTest<Float4E2m1SchedulerTestParams> {
     protected:
      int64_t size;
      bool dynamic_shape;
      void SetUp() {
        std::tie(size, dynamic_shape) = GetParam();
      }
    };
    
    TEST_P(Float4E2m1SchedulerTestAllArch, CopyKernel) {
      auto fusion_ptr = std::make_unique<Fusion>();
      auto& fusion = *fusion_ptr;
      FusionGuard fg(fusion_ptr.get());
    
      TensorView* tv0 = dynamic_shape
          ? makeContigTensor(1, DataType::Float4_e2m1)
          : makeContigConcreteTensor({size}, DataType::Float4_e2m1);
      fusion.addInput(tv0);
      TensorView* tv1 = set(tv0);
      fusion.addOutput(tv1);
    
      auto options = at::TensorOptions().dtype(torch::kUInt8).device(at::kCUDA, 0);
      at::Tensor input = at::randint(0, 256, {size / 2}, options);
    
      FusionExecutorCache executor_cache(std::move(fusion_ptr));
      auto outputs = executor_cache.runFusionWithInputs({input});
    
      EXPECT_TRUE(outputs[0].as<at::Tensor>().equal(input));
    }
    
    std::string fp4E2m1SchedulerName(
        const testing::TestParamInfo<Float4E2m1SchedulerTestParams>& info) {
      const auto& [size, dynamic_shape] = info.param;
      return "Size" + std::to_string(size) + "_DynamicShape" +
          std::to_string(dynamic_shape);
    }
    
    INSTANTIATE_TEST_SUITE_P(
        ,
        Float4E2m1SchedulerTestAllArch,
        testing::Combine(
            testing::Values(
                2,
                64,
                64 + 2,
                1024,
                1024 + 2,
                64 * 1024,
                64 * 1024 + 2,
                1024 * 1024,
                1024 * 1024 + 2,
                64 * 1024 * 1024,
                64 * 1024 * 1024 + 2),
            testing::Values(false, true)),
        fp4E2m1SchedulerName);

    Test Coverage

    The new tests cover a wide range of sizes, vectorization factors, and static/dynamic shapes, but they only exercise a simple copy kernel. Consider whether the cases are representative of real-world pointwise fusions and whether additional edge cases are needed, as sketched below.
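
    One way to extend coverage (a sketch only; the EdgeCases prefix and the extra sizes are illustrative and not part of this PR) is a second instantiation of the existing scheduler fixture, reusing fp4E2m1SchedulerName:

    // Sketch: extra edge-case sizes for the scheduler test (all even, since the
    // uint8 backing tensor is allocated with {size / 2}).
    INSTANTIATE_TEST_SUITE_P(
        EdgeCases,
        Float4E2m1SchedulerTestAllArch,
        testing::Combine(
            testing::Values(4, 6, 1024 - 2, 4096 + 2),
            testing::Values(false, true)),
        fp4E2m1SchedulerName);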

    Performance Metrics

    While new tests are added, there is no mention of performance metrics or benchmarks. It is crucial to include performance data to validate the effectiveness of the FP4 support in the pointwise scheduler.
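
    If benchmark data is wanted, one lightweight option (a sketch only, not part of this PR; it reuses ke and input from the manual-schedule test above and assumes timing on the default stream) is to wrap the existing KernelExecutor::run call in CUDA events and report effective bandwidth:

    // Sketch: CUDA-event timing around the compiled copy kernel (requires
    // <cuda_runtime.h>). The iteration count is arbitrary.
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    constexpr int kIters = 100;
    ke.run({input});  // warm-up

    cudaEventRecord(start);
    for (int i = 0; i < kIters; ++i) {
      ke.run({input});
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float total_ms = 0.0f;
    cudaEventElapsedTime(&total_ms, start, stop);
    // Effective bandwidth: each iteration reads and writes input.numel() bytes.
    double gb_per_s = 2.0 * input.numel() * kIters / 1.0e9 / (total_ms / 1.0e3);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);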

