-
Notifications
You must be signed in to change notification settings - Fork 75
Open
Description
Issue: The preseg pass is disabled in TransposeTest, so the squeeze is not peeled off from the fusion, which leads to a BFS traversal failure in the IndexLowering pass.
To reproduce:
// Repro: a broadcast -> sub -> mul chain whose result has its leading size-1
// dimension squeezed, executed through FusionExecutorCache from the
// TransposeTest fixture and checked with testValidate.
TEST_F(TransposeTest, TMP1) {
  auto fusion_owner = std::make_unique<Fusion>();
  Fusion* fusion = fusion_owner.get();
  FusionGuard fg(fusion);

  // Fusion inputs: concrete, contiguous float tensors.
  auto tv0 = makeContigConcreteTensor({1, 2048}, DataType::Float);
  auto tv1 = makeContigConcreteTensor({1, 2048, 12288}, DataType::Float);
  auto tv2 = makeContigConcreteTensor({1, 2048, 1}, DataType::Float);
  fusion->addInput(tv0);
  fusion->addInput(tv1);
  fusion->addInput(tv2);

  // Append a trailing broadcast dimension to tv0 so it lines up with tv1.
  auto bcast = broadcast(tv0, {false, false, true});
  auto diff = sub(bcast, tv1);
  auto scaled = mul(diff, tv2);

  // Squeeze only the leading dimension; this is the op the report is about.
  const std::vector<bool> squeeze_mask{true, false, false};
  auto squeezed = squeeze(scaled, squeeze_mask);
  fusion->addOutput(squeezed);

  // Random CUDA inputs with the same shapes as the fusion inputs above.
  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto t0 = at::randn({1, 2048}, options);
  auto t1 = at::randn({1, 2048, 12288}, options);
  auto t2 = at::randn({1, 2048, 1}, options);

  // The cache takes ownership of the fusion; use executor_cache.fusion()
  // from here on.
  FusionExecutorCache executor_cache(std::move(fusion_owner));
  auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1, t2});
  testValidate(
      executor_cache.fusion(), cg_outputs, {t0, t1, t2}, __LINE__, __FILE__);
}
Error when the fusion is not segmented at the squeeze op:
Inputs:
T0_g_float[bS0{1}, iS1{2048}]
T1_g_float[bS2{1}, iS3{2048}, iS4{12288}]
T2_g_float[bS5{1}, iS6{2048}, bS7{1}]
Outputs:
T6_g_float[iS17{2048}, iS18{12288}]
%kernel_math {
T3_l_float[bS8{1}, iS9{2048}, bS10{1}]
= broadcast( T0_g_float[bS0{1}, iS1{2048}], flags = {false, false, true} )
T4_l_float[bS11{1}, iS12{2048}, iS13{12288}]
= T3_l_float[bS8{1}, iS9{2048}, bS10{1}]
- T1_g_float[bS2{1}, iS3{2048}, iS4{12288}];
T5_l_float[bS14{1}, iS15{2048}, iS16{12288}]
= T4_l_float[bS11{1}, iS12{2048}, iS13{12288}]
* T2_g_float[bS5{1}, iS6{2048}, bS7{1}];
T6_g_float[iS17{2048}, iS18{12288}]
= squeeze( T5_l_float[bS14{1}, iS15{2048}, iS16{12288}], flags = {true, false, false} )
} // %kernel_math
unknown file: Failure
C++ exception with description " INTERNAL ASSERT FAILED at /opt/pytorch/nvfuser/csrc/bfs.h:261, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues.
BFS traversal could not visit some nodes: idg{111 116} idg{112 117} idg{110 115} (from: idg{41 42 48 49 55 56 76 77 83 84 90 91 188 190} idg{43 50 57 78 85 92 191} idg{154 159 164 169 174 179 198} idg{155 160 165 170 175 180 199} idg{153 158 163 168 173 178 197 201} idg{0 2 5 8 11 14 27 29 32}), visited: ( idg{193} idg{141 145} idg{140 144} idg{4 13 16 18 31 36} idg{155 160 165 170 175 180 199} idg{40 45 52 59 66 73 80 87 94 101 105} idg{37 46 53 74 81 88 186} idg{153 158 163 168 173 178 197 201} idg{195} idg{194} idg{154 159 164 169 174 179 198} idg{0 2 5 8 11 14 27 29 32} idg{43 50 57 78 85 92 191} idg{60 67 95} idg{41 42 48 49 55 56 76 77 83 84 90 91 188 190} idg{192} idg{39 44 51 58 62 63 65 69 70 72 79 86 93 97 98 100 102 104 106} idg{152 157 162 167 172 177 196 200} idg{64 71 99 103 107} idg{151 156 161 166 171 176} idg{38 47 54 75 82 89 187} idg{138 142} idg{139 143} idg{189} idg{1 3 6 9 12 15 17 28 30 33 35})
No error if fusion is segmented at squeeze op (use NVFuserTest to allow preseg pass to segment at squeeze op)
Inputs:
T7_g_float[bS19{1}, iS20{2048}, iS21{12288}]
Outputs:
T6_g_float[iS17{2048}, iS18{12288}]
%kernel_math {
T6_g_float[iS17{2048}, iS18{12288}]
= squeeze( T7_g_float[bS19{1}, iS20{2048}, iS21{12288}], flags = {true, false, false} )
} // %kernel_math
Inputs:
T2_g_float[bS5{1}, iS6{2048}, bS7{1}]
T0_g_float[bS0{1}, iS1{2048}]
T1_g_float[bS2{1}, iS3{2048}, iS4{12288}]
Outputs:
T7_g_float[bS19{1}, iS20{2048}, iS21{12288}]
%kernel_math {
T3_l_float[bS8{1}, iS9{2048}, bS10{1}]
= broadcast( T0_g_float[bS0{1}, iS1{2048}], flags = {false, false, true} )
T4_g_float[bS11{1}, iS12{2048}, iS13{12288}]
= T3_l_float[bS8{1}, iS9{2048}, bS10{1}]
- T1_g_float[bS2{1}, iS3{2048}, iS4{12288}];
T5_g_float[bS14{1}, iS15{2048}, iS16{12288}]
= T4_g_float[bS11{1}, iS12{2048}, iS13{12288}]
* T2_g_float[bS5{1}, iS6{2048}, bS7{1}];
T7_g_float[bS19{1}, iS20{2048}, iS21{12288}]
= SegmenterSet( T5_g_float[bS14{1}, iS15{2048}, iS16{12288}] )
} // %kernel_math
Context of the original issue:
While working on heuristic optimization of the inner-outer persistent scheduler, I noticed a failure of tests/python/multidevice/test_transformer.py::test_transformer_backward[TENSOR_PARALLEL]; it has a segmented fusion that contains a squeeze op.
g{(transpose)
group id: 24
inputs:
T4_g_bool[bS11{1}, iS12{2048}, iS13{12288}] (DeviceMesh{0}) bool
T5_g___bfloat[bS14{1}, iS15{2048}, iS16{12288}] (DeviceMesh{0}) __bfloat
T7_g___bfloat[iS19{12288}] (DeviceMesh{0}) __bfloat
T8_g_float[bS20{1}, iS21{2048}] (DeviceMesh{0}) float
T9_g___bfloat[bS22{1}, iS23{2048}, iS24{12288}] (DeviceMesh{0}) __bfloat
T10_g_float[bS25{1}, iS26{2048}, bS27{1}] (DeviceMesh{0}) float
T22_g___bfloat[iS50{12288}] (DeviceMesh{0}) __bfloat
outputs:
T64_g___bfloat[bS172{1}, bS173{1 ex 2048}, iS174{12288}] (DeviceMesh{0}) __bfloat
T74_g_float[bS202{1}, iS203{2048}, iS204{12288}] (DeviceMesh{0}) float
T80_g_float[bS218{1}, iS219{2048}, bS220{1 ex 12288}] (DeviceMesh{0}) float
T102_g_float[bS274{1}, iS275{2048}, iS276{12288}] (DeviceMesh{0}) float
T187_g_float[bS548{1}, iS549{2048}, iS550{12288}] (DeviceMesh{0}) float
T229_g___bfloat[iS662{2048}, iS663{12288}] (DeviceMesh{0}) __bfloat
T59_g_float[bS157{1}, iS158{2048}, iS159{12288}] (DeviceMesh{0})
= __bfloat2float(T5_g___bfloat[bS14{1}, iS15{2048}, iS16{12288}] (DeviceMesh{0}));
(36)
T185_l___bfloat[bS542{1}, bS543{1}, iS544{12288}] (DeviceMesh{0})
= broadcast( T22_g___bfloat[iS50{12288}] (DeviceMesh{0}), flags = {true, true, false} )
(176)
T186_g___bfloat[bS545{1}, bS546{1 ex 2048}, iS547{12288}] (DeviceMesh{0}) = expand( T185_l___bfloat[bS542{1}, bS543{1}, iS544{12288}] (DeviceMesh{0}) )
(270)
T63_l___bfloat[bS169{1}, bS170{1}, iS171{12288}] (DeviceMesh{0})
= broadcast( T7_g___bfloat[iS19{12288}] (DeviceMesh{0}), flags = {true, true, false} )
(40)
T64_g___bfloat[bS172{1}, bS173{1 ex 2048}, iS174{12288}] (DeviceMesh{0}) = expand( T63_l___bfloat[bS169{1}, bS170{1}, iS171{12288}] (DeviceMesh{0}) )
(269)
T70_l_float[bS190{1}, bS191{1 ex 2048}, iS192{12288}] (DeviceMesh{0})
= __bfloat2float(T64_g___bfloat[bS172{1}, bS173{1 ex 2048}, iS174{12288}] (DeviceMesh{0}));
(47)
T193_l_float[bS567{1}, bS568{1 ex 2048}, iS569{12288}] (DeviceMesh{0})
= __bfloat2float(T186_g___bfloat[bS545{1}, bS546{1 ex 2048}, iS547{12288}] (DeviceMesh{0}));
(184)
T93_l_float[bS251{1}, iS252{2048}, bS253{1}] (DeviceMesh{0})
= broadcast( T8_g_float[bS20{1}, iS21{2048}] (DeviceMesh{0}), flags = {false, false, true} )
(70)
T94_g_float[bS254{1}, iS255{2048}, bS256{1}] (DeviceMesh{0})
= Set( T93_l_float[bS251{1}, iS252{2048}, bS253{1}] (DeviceMesh{0}), cache_op=Streaming )
(71)
T98_g_float[bS264{1}, iS265{2048}, bS266{1 ex 12288}] (DeviceMesh{0}) = expand( T94_g_float[bS254{1}, iS255{2048}, bS256{1}] (DeviceMesh{0}) )
(273)
T65_l_float[bS175{1}, iS176{2048}, bS177{1}] (DeviceMesh{0})
= broadcast( T8_g_float[bS20{1}, iS21{2048}] (DeviceMesh{0}), flags = {false, false, true} )
(42)
T66_g_float[bS178{1}, iS179{2048}, bS180{1}] (DeviceMesh{0})
= Set( T65_l_float[bS175{1}, iS176{2048}, bS177{1}] (DeviceMesh{0}), cache_op=Streaming )
(43)
T71_g_float[bS193{1}, iS194{2048}, bS195{1 ex 12288}] (DeviceMesh{0}) = expand( T66_g_float[bS178{1}, iS179{2048}, bS180{1}] (DeviceMesh{0}) )
(267)
T68_l_float[bS184{1}, iS185{2048}, iS186{12288}] (DeviceMesh{0})
= __bfloat2float(T9_g___bfloat[bS22{1}, iS23{2048}, iS24{12288}] (DeviceMesh{0}));
(45)
T58_l_float[bS154{1}, iS155{2048}, iS156{12288}] (DeviceMesh{0})
= __to_float(T4_g_bool[bS11{1}, iS12{2048}, iS13{12288}] (DeviceMesh{0}));
(35)
T61_l_float[bS163{1}, iS164{2048}, iS165{12288}] (DeviceMesh{0})
= T59_g_float[bS157{1}, iS158{2048}, iS159{12288}] (DeviceMesh{0})
* T58_l_float[bS154{1}, iS155{2048}, iS156{12288}] (DeviceMesh{0});
(38)
T67_l_float[bS181{1}, iS182{2048}, iS183{12288}] (DeviceMesh{0})
= T61_l_float[bS163{1}, iS164{2048}, iS165{12288}] (DeviceMesh{0})
* double(1.11111);
(44)
T72_g_float[bS196{1}, iS197{2048}, iS198{12288}] (DeviceMesh{0})
= T68_l_float[bS184{1}, iS185{2048}, iS186{12288}] (DeviceMesh{0})
+ T67_l_float[bS181{1}, iS182{2048}, iS183{12288}] (DeviceMesh{0});
(49)
T102_g_float[bS274{1}, iS275{2048}, iS276{12288}] (DeviceMesh{0})
= T72_g_float[bS196{1}, iS197{2048}, iS198{12288}] (DeviceMesh{0})
- T98_g_float[bS264{1}, iS265{2048}, bS266{1 ex 12288}] (DeviceMesh{0});
(79)
T80_g_float[bS218{1}, iS219{2048}, bS220{1 ex 12288}] (DeviceMesh{0}) = expand( T10_g_float[bS25{1}, iS26{2048}, bS27{1}] (DeviceMesh{0}) )
(268)
T74_g_float[bS202{1}, iS203{2048}, iS204{12288}] (DeviceMesh{0})
= T72_g_float[bS196{1}, iS197{2048}, iS198{12288}] (DeviceMesh{0})
- T71_g_float[bS193{1}, iS194{2048}, bS195{1 ex 12288}] (DeviceMesh{0});
(51)
T187_g_float[bS548{1}, iS549{2048}, iS550{12288}] (DeviceMesh{0})
= T74_g_float[bS202{1}, iS203{2048}, iS204{12288}] (DeviceMesh{0})
* T80_g_float[bS218{1}, iS219{2048}, bS220{1 ex 12288}] (DeviceMesh{0});
(178)
T194_l_float[bS570{1}, iS571{2048}, iS572{12288}] (DeviceMesh{0})
= T187_g_float[bS548{1}, iS549{2048}, iS550{12288}] (DeviceMesh{0})
* T70_l_float[bS190{1}, bS191{1 ex 2048}, iS192{12288}] (DeviceMesh{0});
(185)
T199_l_float[bS586{1}, iS587{2048}, iS588{12288}] (DeviceMesh{0})
= T194_l_float[bS570{1}, iS571{2048}, iS572{12288}] (DeviceMesh{0})
+ T193_l_float[bS567{1}, bS568{1 ex 2048}, iS569{12288}] (DeviceMesh{0});
(191)
T208_g___bfloat[bS617{1}, iS618{2048}, iS619{12288}] (DeviceMesh{0})
= __float2bfloat(T199_l_float[bS586{1}, iS587{2048}, iS588{12288}] (DeviceMesh{0}));
(201)
T229_g___bfloat[iS662{2048}, iS663{12288}] (DeviceMesh{0})
= squeeze( T208_g___bfloat[bS617{1}, iS618{2048}, iS619{12288}] (DeviceMesh{0}), flags = {true, false, false} )
(222)
}
Metadata
Metadata
Assignees
Labels
No labels