Skip to content

max_persistent_buffer_size may be smaller than total_reduction_numel #4075

@naoyam

Description

@naoyam

Repro:

// Repro for a division-by-zero hit while computing InnerPersistent
// heuristics. Per the issue report, safeDiv is called with y == 0 because
// getMaxPersistentBatch receives buffer_bytes_per_batch == 0.
TEST_F(PersistentBufferTest, BroadcastDivByZero) {
  auto fusion_ptr = std::make_unique<Fusion>();
  auto& fusion = *fusion_ptr;
  FusionGuard fg(fusion_ptr.get());

  // Two BFloat16 inputs. At runtime tv0 is bound to a {64} tensor and tv1 to
  // a {64, 16} tensor (see t0/t1 below).
  auto tv0 = makeSymbolicTensor(1, DataType::BFloat16);
  fusion.addInput(tv0);
  auto tv1 = makeSymbolicTensor(2, DataType::BFloat16);
  fusion.addInput(tv1);

  // Broadcast tv0 so it can be added to tv1 -- presumably {false, true}
  // appends a broadcast axis as the second dimension; TODO confirm.
  auto tv2 = broadcast(tv0, {false, true});
  auto tv3 = castOp(DataType::Float, tv2);
  auto tv4 = castOp(DataType::Float, tv1);
  auto tv5 = add(tv3, tv4);
  // Reduce over both axes, then broadcast the result back for the add below.
  auto tv6 = sum(tv5, {0, 1});
  auto tv7 = broadcast(tv6, {true});

  // tv0 is used a second time here, after the reduction. The issue report
  // identifies t0 as the persistent buffer: it has 64 elements while the
  // reduction covers 16 * 64 elements.
  auto tv8 = castOp(DataType::Float, tv0);
  auto tv9 = add(tv8, tv7);
  auto tv10 = castOp(DataType::BFloat16, tv9);
  fusion.addOutput(tv10);

  // Debug aid: print the fusion's math.
  fusion.printMath();

  auto options = at::TensorOptions().dtype(at::kBFloat16).device(at::kCUDA, 0);
  auto t0 = at::randn({64}, options);
  auto t1 = at::randn({64, 16}, options);
  SchedulerRuntimeInfo runtime_info(fusion_ptr.get(), {t0, t1});
  // The fusion must be accepted by the InnerPersistent scheduler for the
  // repro to be meaningful.
  ASSERT_TRUE(Schedule::canSchedule(
      SchedulerType::InnerPersistent, fusion_ptr.get(), runtime_info));
  auto scheduler =
      SchedulerEntry::makeSchedulerInstance(SchedulerType::InnerPersistent);
  // NOTE(review): the reported backtrace goes through
  // innerPersistentHeuristic2D -> getMaxPersistentBatch -> safeDiv, so the
  // crash is presumably triggered from computeHeuristics -- the outer frames
  // are not shown in the report; confirm.
  auto heuristic_params =
      scheduler->computeHeuristics(fusion_ptr.get(), runtime_info);
  scheduler->schedule(fusion_ptr.get(), heuristic_params.get());
}

This results in a division-by-zero error:

0x0000555556a951d6 in nvfuser::scheduler_utils::safeDiv (x=224, y=0) at /home/nmaruyama/nvfuser/debug1/csrc/scheduler/utils.h:104
104       return std::max(x / y, (int64_t)1);
(gdb) bt
#0  0x0000555556a951d6 in nvfuser::scheduler_utils::safeDiv (x=224, y=0) at /home/nmaruyama/nvfuser/debug1/csrc/scheduler/utils.h:104
#1  0x0000555556a8f485 in nvfuser::(anonymous namespace)::getMaxPersistentBatch (buffer_bytes_per_batch=0, target_threads_per_sm=896, register_overhead=16, is_high_bandwidth_flops_ratio=false)
    at /home/nmaruyama/nvfuser/debug1/csrc/scheduler/normalization_inner.cpp:147
#2  0x0000555556a8dde8 in nvfuser::(anonymous namespace)::innerPersistentHeuristic2D (properties=..., rparams=0x55555b573f90) at /home/nmaruyama/nvfuser/debug1/csrc/scheduler/normalization_inner.cpp:414

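This is because the persistent buffer, t0, has a smaller number of elements (64) than the number of reduced elements (16 * 64). As a result, getMaxPersistentBatch is called with buffer_bytes_per_batch=0 (frame #1 above), and safeDiv then divides by zero (x=224, y=0).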

Metadata

Metadata

Assignees

Labels

No labels
No labels

Type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions