Commit fe71802

Profile nested mtasks
Signed-off-by: Bartłomiej Chmiel <[email protected]>
1 parent: cbabd8a

9 files changed: +179 -147 lines
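
At the core of the change, MTASK_END records no longer carry an mtask id: begin/end records now pair by stack discipline (each end closes the most recently begun mtask on that thread), and Mtasks is keyed by (module, mtask) so a sub-model's ids cannot collide with the top model's. A minimal sketch of that pairing, using ticks borrowed from the t_gantt_io.dat update below (the variable names here are illustrative, not from the commit):

    import collections

    # Simplified trace: (kind, tick, (module, mtask)) tuples; the nested pair
    # mirrors the new records in t_gantt_io.dat below.
    trace = [
        ('MTASK_BEGIN', 2695, ('top', 6)),
        ('MTASK_BEGIN', 3795, ('sub', 10)),  # begins before mtask 6 ends: nested
        ('MTASK_END', 4850, None),           # no id: pops ('sub', 10)
        ('MTASK_END', 5905, None),           # pops ('top', 6)
    ]

    stack = []                              # plays the role of MtasksStack
    elapsed = collections.defaultdict(int)  # keyed by (module, mtask), as in the diff
    for kind, tick, key in trace:
        if kind == 'MTASK_BEGIN':
            stack.append((key, tick))
        else:
            key, begin = stack.pop()
            elapsed[key] += tick - begin

    print(dict(elapsed))  # {('sub', 10): 1055, ('top', 6): 3210}

Note that the elapsed time of the outer mtask includes the nested one, matching how the script accumulates tick - begin per (module, mtask) key.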

bin/verilator_gantt

Lines changed: 59 additions & 35 deletions
@@ -37,8 +37,9 @@ def read_data(filename):
     with open(filename, "r", encoding="utf8") as fh:
         re_thread = re.compile(r'^VLPROFTHREAD (\d+)$')
         re_record = re.compile(r'^VLPROFEXEC (\S+) (\d+)(.*)$')
-        re_payload_mtaskBegin = re.compile(r'id (\d+) predictStart (\d+) cpu (\d+)')
-        re_payload_mtaskEnd = re.compile(r'id (\d+) predictCost (\d+)')
+        re_payload_mtaskBegin = re.compile(
+            r'id (\d+) predictStart (\d+) cpu (\d+) module\s*(\w+)?')
+        re_payload_mtaskEnd = re.compile(r'predictCost (\d+)')
         re_payload_wait = re.compile(r'cpu (\d+)')
 
         re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.]*)\s*')
@@ -56,8 +57,8 @@ def read_data(filename):
 
         ExecGraphStack = []
         SectionStack = []
+        MtasksStack = []
         ThreadScheduleWait = collections.defaultdict(list)
-        mTaskThread = {}
 
         for line in fh:
             recordMatch = re_record.match(line)
@@ -74,33 +75,35 @@ def read_data(filename):
                     SectionStack.pop()
                     Sections[thread].append((tick, tuple(SectionStack)))
                 elif kind == "MTASK_BEGIN":
-                    mtask, predict_start, ecpu = re_payload_mtaskBegin.match(payload).groups()
+                    mtask, predict_start, ecpu, module = re_payload_mtaskBegin.match(
+                        payload).groups()
                     mtask = int(mtask)
                     predict_start = int(predict_start)
                     ecpu = int(ecpu)
-                    mTaskThread[mtask] = thread
                     records = Threads[thread]
-                    assert not records or records[-1]['start'] <= records[-1]['end'] <= tick
                     records.append({
                         'start': tick,
                         'mtask': mtask,
                         'predict_start': predict_start,
+                        'module': module,
                         'cpu': ecpu
                     })
-                    Mtasks[mtask]['begin'] = tick
-                    Mtasks[mtask]['thread'] = thread
-                    Mtasks[mtask]['predict_start'] = predict_start
+                    Mtasks[(module, mtask)]['begin'] = tick
+                    Mtasks[(module, mtask)]['predict_start'] = predict_start
+                    Mtasks[(module, mtask)]['thread'] = thread
+                    MtasksStack.append((module, mtask, records[-1]))
                 elif kind == "MTASK_END":
-                    mtask, predict_cost = re_payload_mtaskEnd.match(payload).groups()
+                    predict_cost, = re_payload_mtaskEnd.match(payload).groups()
                     mtask = int(mtask)
+                    module, mtask, record = MtasksStack.pop()
                     predict_cost = int(predict_cost)
-                    begin = Mtasks[mtask]['begin']
-                    record = Threads[mTaskThread[mtask]][-1]
+                    begin = Mtasks[(module, mtask)]['begin']
                     record['end'] = tick
+                    assert record and records[-1]['start'] <= records[-1]['end'] <= tick
                     record['predict_cost'] = predict_cost
-                    Mtasks[mtask]['elapsed'] += tick - begin
-                    Mtasks[mtask]['predict_cost'] = predict_cost
-                    Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], tick)
+                    Mtasks[(module, mtask)]['elapsed'] += tick - begin
+                    Mtasks[(module, mtask)]['predict_cost'] = predict_cost
+                    Mtasks[(module, mtask)]['end'] = max(Mtasks[(module, mtask)]['end'], tick)
                 elif kind == "THREAD_SCHEDULE_WAIT_BEGIN":
                     ecpu = int(re_payload_wait.match(payload).groups()[0])
                     ThreadScheduleWait[ecpu].append(tick)
@@ -234,8 +237,8 @@ def report_mtasks():
     long_mtask = None
     predict_mtask_time = 0
     predict_elapsed = 0
-    for mtaskId in Mtasks:
-        record = Mtasks[mtaskId]
+    for (module, mtaskId) in Mtasks:
+        record = Mtasks[(module, mtaskId)]
         predict_mtask_time += record['predict_cost']
         total_mtask_time += record['elapsed']
         thread_mtask_time[record['thread']] += record['elapsed']
@@ -244,6 +247,7 @@
         if record['elapsed'] > long_mtask_time:
             long_mtask_time = record['elapsed']
             long_mtask = mtaskId
+            long_mtask_module = module
     Global['predict_last_end'] = predict_elapsed
 
     serialTime = ElapsedTime - ExecGraphTime
@@ -272,31 +276,51 @@
     max_p2e = -1000000
     max_mtask = None
 
-    for mtask in sorted(Mtasks.keys()):
-        if Mtasks[mtask]['elapsed'] > 0:
-            if Mtasks[mtask]['predict_cost'] == 0:
-                Mtasks[mtask]['predict_cost'] = 1  # don't log(0) below
-            p2e_ratio = math.log(Mtasks[mtask]['predict_cost'] / Mtasks[mtask]['elapsed'])
+    for (module, mtaskId) in sorted(Mtasks.keys()):
+        mtask = Mtasks[(module, mtaskId)]
+        if mtask['elapsed'] > 0:
+            if mtask['predict_cost'] == 0:
+                mtask['predict_cost'] = 1  # don't log(0) below
+            p2e_ratio = math.log(mtask['predict_cost'] / mtask['elapsed'])
             p2e_ratios.append(p2e_ratio)
 
             if p2e_ratio > max_p2e:
                 max_p2e = p2e_ratio
-                max_mtask = mtask
+                max_mtask = mtaskId
+                max_module = module
             if p2e_ratio < min_p2e:
                 min_p2e = p2e_ratio
-                min_mtask = mtask
+                min_mtask = mtaskId
+                min_module = module
 
     print("\nMTask statistics:")
-    print("  Longest mtask id = {}".format(long_mtask))
+    if long_mtask_module:
+        print("  Longest mtask id = {} from module '{}'".format(long_mtask, long_mtask_module))
+    else:
+        print("  Longest mtask id = {}".format(long_mtask))
     print("  Longest mtask time = {:.2%} of time elapsed in parallelized code".format(
         long_mtask_time / ExecGraphTime))
     print("  min log(p2e) = %0.3f" % min_p2e, end="")
 
-    print(" from mtask %d (predict %d," % (min_mtask, Mtasks[min_mtask]['predict_cost']), end="")
-    print(" elapsed %d)" % Mtasks[min_mtask]['elapsed'])
+    if min_module:
+        print(" from module '%s' mtask %d (predict %d," %
+              (min_module, min_mtask, Mtasks[(min_module, min_mtask)]['predict_cost']),
+              end="")
+    else:
+        print(" from mtask %d (predict %d," %
+              (min_mtask, Mtasks[(min_module, min_mtask)]['predict_cost']),
+              end="")
+    print(" elapsed %d)" % Mtasks[(min_module, min_mtask)]['elapsed'])
     print("  max log(p2e) = %0.3f" % max_p2e, end="")
-    print(" from mtask %d (predict %d," % (max_mtask, Mtasks[max_mtask]['predict_cost']), end="")
-    print(" elapsed %d)" % Mtasks[max_mtask]['elapsed'])
+    if max_module:
+        print(" from module '%s' mtask %d (predict %d," %
+              (max_module, max_mtask, Mtasks[(max_module, max_mtask)]['predict_cost']),
+              end="")
+    else:
+        print(" from mtask %d (predict %d," %
+              (max_mtask, Mtasks[(max_module, max_mtask)]['predict_cost']),
+              end="")
+    print(" elapsed %d)" % Mtasks[(max_module, max_mtask)]['elapsed'])
 
     stddev = statistics.pstdev(p2e_ratios)
     mean = statistics.mean(p2e_ratios)
@@ -482,17 +506,17 @@ def write_vcd(filename):
        # Compute scale so predicted graph is of same width as interval
        measured_scaling = (end - start) / Global['predict_last_end']
        # Predict mtasks that fill the time the execution occupied
-       for mtask in Mtasks:
-           thread = Mtasks[mtask]['thread']
-           pred_scaled_start = start + int(Mtasks[mtask]['predict_start'] * measured_scaling)
+       for (module, mtaskId) in Mtasks:
+           mtask = Mtasks[(module, mtaskId)]
+           thread = mtask['thread']
+           pred_scaled_start = start + int(mtask['predict_start'] * measured_scaling)
            pred_scaled_end = start + int(
-               (Mtasks[mtask]['predict_start'] + Mtasks[mtask]['predict_cost']) *
-               measured_scaling)
+               (mtask['predict_start'] + mtask['predict_cost']) * measured_scaling)
            if pred_scaled_start == pred_scaled_end:
                continue
 
            mcode = getCode(32, 'predicted', 't%d_mtask' % thread)
-           addValue(mcode, pred_scaled_start, mtask)
+           addValue(mcode, pred_scaled_start, mtaskId)
            addValue(mcode, pred_scaled_end, None)
 
            parallelism['predicted'][pred_scaled_start] += 1
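
One subtlety in the new re_payload_mtaskBegin: dump() always prints the "module" keyword, but per the comment added in V3ExecGraph.cpp below, v3Global.opt.topModule() can be empty for non-hierarchical runs, so the name itself is an optional group that parses as None, which the `if long_mtask_module:` / `if min_module:` / `if max_module:` fallbacks above handle. A standalone check (payloads shown here without the leading space that separates them from the tick field):

    import re

    # The regex added above in bin/verilator_gantt
    re_payload_mtaskBegin = re.compile(r'id (\d+) predictStart (\d+) cpu (\d+) module\s*(\w+)?')

    # Hierarchical run: module name present
    print(re_payload_mtaskBegin.match('id 6 predictStart 0 cpu 19 module top').groups())
    # -> ('6', '0', '19', 'top')

    # Non-hierarchical run: empty module name, optional group does not participate
    print(re_payload_mtaskBegin.match('id 6 predictStart 0 cpu 19 module').groups())
    # -> ('6', '0', '19', None)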

include/verilated_profiler.cpp

Lines changed: 3 additions & 3 deletions
@@ -190,13 +190,13 @@ void VlExecutionProfiler::dump(const char* filenamep, uint64_t tickEnd)
             break;
         case VlExecutionRecord::Type::MTASK_BEGIN: {
             const auto& payload = er.m_payload.mtaskBegin;
-            fprintf(fp, " id %u predictStart %u cpu %u\n", payload.m_id,
-                    payload.m_predictStart, payload.m_cpu);
+            fprintf(fp, " id %u predictStart %u cpu %u module %s\n", payload.m_id,
+                    payload.m_predictStart, payload.m_cpu, payload.m_module);
             break;
         }
         case VlExecutionRecord::Type::MTASK_END: {
             const auto& payload = er.m_payload.mtaskEnd;
-            fprintf(fp, " id %u predictCost %u\n", payload.m_id, payload.m_predictCost);
+            fprintf(fp, " predictCost %u\n", payload.m_predictCost);
             break;
         }
         case VlExecutionRecord::Type::THREAD_SCHEDULE_WAIT_BEGIN:

include/verilated_profiler.h

Lines changed: 4 additions & 4 deletions
@@ -92,9 +92,9 @@ class VlExecutionRecord final {
             uint32_t m_id;  // MTask id
             uint32_t m_predictStart;  // Time scheduler predicted would start
             uint32_t m_cpu;  // Executing CPU id
+            const char* m_module;  // Name of module with this mtask
         } mtaskBegin;
         struct {
-            uint32_t m_id;  // MTask id
             uint32_t m_predictCost;  // How long scheduler predicted would take
         } mtaskEnd;
         struct {
@@ -120,14 +120,14 @@ class VlExecutionRecord final {
         m_type = Type::SECTION_PUSH;
     }
     void sectionPop() { m_type = Type::SECTION_POP; }
-    void mtaskBegin(uint32_t id, uint32_t predictStart) {
+    void mtaskBegin(uint32_t id, uint32_t predictStart, const char* moduleName) {
         m_payload.mtaskBegin.m_id = id;
         m_payload.mtaskBegin.m_predictStart = predictStart;
         m_payload.mtaskBegin.m_cpu = VlOs::getcpu();
+        m_payload.mtaskBegin.m_module = moduleName;
         m_type = Type::MTASK_BEGIN;
     }
-    void mtaskEnd(uint32_t id, uint32_t predictCost) {
-        m_payload.mtaskEnd.m_id = id;
+    void mtaskEnd(uint32_t predictCost) {
         m_payload.mtaskEnd.m_predictCost = predictCost;
         m_type = Type::MTASK_END;
     }

src/V3ExecGraph.cpp

Lines changed: 9 additions & 13 deletions
@@ -916,18 +916,15 @@ void wrapMTaskBodies(AstExecGraph* const execGraphp) {
         funcp->addStmtsp(new AstCStmt{flp, stmt});
     };
 
-    if (v3Global.opt.hierChild() || !v3Global.opt.hierBlocks().empty()) {
-        addStrStmt(
-            "static const unsigned taskId = vlSymsp->__Vm_threadPoolp->assignTaskIndex();\n");
-    } else {
-        const string& id = std::to_string(mtaskp->id());
-        addStrStmt("static constexpr unsigned taskId = " + id + ";\n");
-    }
+    addStrStmt("static constexpr unsigned taskId = " + cvtToStr(mtaskp->id()) + ";\n");
 
-    if (v3Global.opt.profExec() && mtaskp->threads() <= 1) {
+    if (v3Global.opt.profExec()) {
         const string& predictStart = std::to_string(mtaskp->predictStart());
+        // We use the top module flag as it is always set for all hierarchical phases;
+        // when it is not set, we assume a non-hierarchical run, where the module name
+        // is not needed anyway.
         addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).mtaskBegin(taskId, " + predictStart
-                   + ");\n");
+                   + ", \"" + v3Global.opt.topModule() + "\");\n");
     }
 
     // Set mtask ID in the run-time system
@@ -939,10 +936,9 @@ void wrapMTaskBodies(AstExecGraph* const execGraphp) {
     // Flush message queue
     addStrStmt("Verilated::endOfThreadMTask(vlSymsp->__Vm_evalMsgQp);\n");
 
-    if (v3Global.opt.profExec() && mtaskp->threads() <= 1) {
-        const string& predictConst = std::to_string(mtaskp->cost());
-        addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).mtaskEnd(taskId, " + predictConst
-                   + ");\n");
+    if (v3Global.opt.profExec()) {
+        const string& predictCost = std::to_string(mtaskp->cost());
+        addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).mtaskEnd(" + predictCost + ");\n");
     }
 
     // AstMTask will simply contain a call
test_regress/t/t_gantt_io.dat

Lines changed: 30 additions & 28 deletions
@@ -902,40 +902,42 @@ VLPROFPROC power management: ts ttp tm hwpstate cpb eff_freq_ro [13] [14]
 VLPROFPROC
 VLPROFTHREAD 0
 VLPROFEXEC EXEC_GRAPH_BEGIN 945
-VLPROFEXEC MTASK_BEGIN 2695 id 6 predictStart 0 cpu 19
-VLPROFEXEC MTASK_END 2905 id 6 predictCost 30
-VLPROFEXEC MTASK_BEGIN 9695 id 10 predictStart 196 cpu 19
-VLPROFEXEC MTASK_END 9870 id 10 predictCost 30
+VLPROFEXEC MTASK_BEGIN 2695 id 6 predictStart 0 cpu 19 module top
+VLPROFEXEC MTASK_BEGIN 3795 id 10 predictStart 196 cpu 19 module sub
+VLPROFEXEC MTASK_END 4850 predictCost 30
+VLPROFEXEC MTASK_END 5905 predictCost 30
+VLPROFEXEC MTASK_BEGIN 9695 id 10 predictStart 196 cpu 19 module top
+VLPROFEXEC MTASK_END 9870 predictCost 30
 VLPROFEXEC EXEC_GRAPH_END 12180
 VLPROFEXEC EXEC_GRAPH_BEGIN 14000
-VLPROFEXEC MTASK_BEGIN 15610 id 6 predictStart 0 cpu 19
-VLPROFEXEC MTASK_END 15820 id 6 predictCost 30
+VLPROFEXEC MTASK_BEGIN 15610 id 6 predictStart 0 cpu 19 module top
+VLPROFEXEC MTASK_END 15820 predictCost 30
 VLPROFEXEC THREAD_SCHEDULE_WAIT_BEGIN 20000 cpu 19
 VLPROFEXEC THREAD_SCHEDULE_WAIT_END 21000 cpu 19
-VLPROFEXEC MTASK_BEGIN 21700 id 10 predictStart 196 cpu 19
-VLPROFEXEC MTASK_END 21875 id 10 predictCost 30
+VLPROFEXEC MTASK_BEGIN 21700 id 10 predictStart 196 cpu 19 module top
+VLPROFEXEC MTASK_END 21875 predictCost 30
 VLPROFEXEC EXEC_GRAPH_END 22085
 VLPROFTHREAD 1
-VLPROFEXEC MTASK_BEGIN 5495 id 5 predictStart 0 cpu 10
-VLPROFEXEC MTASK_END 6090 id 5 predictCost 30
-VLPROFEXEC MTASK_BEGIN 6300 id 7 predictStart 30 cpu 10
-VLPROFEXEC MTASK_END 6895 id 7 predictCost 30
-VLPROFEXEC MTASK_BEGIN 7490 id 8 predictStart 60 cpu 10
-VLPROFEXEC MTASK_END 8540 id 8 predictCost 107
-VLPROFEXEC MTASK_BEGIN 9135 id 9 predictStart 167 cpu 10
-VLPROFEXEC MTASK_END 9730 id 9 predictCost 30
-VLPROFEXEC MTASK_BEGIN 10255 id 11 predictStart 197 cpu 10
-VLPROFEXEC MTASK_END 11060 id 11 predictCost 30
-VLPROFEXEC MTASK_BEGIN 18375 id 5 predictStart 0 cpu 10
-VLPROFEXEC MTASK_END 18970 id 5 predictCost 30
-VLPROFEXEC MTASK_BEGIN 19145 id 7 predictStart 30 cpu 10
-VLPROFEXEC MTASK_END 19320 id 7 predictCost 30
-VLPROFEXEC MTASK_BEGIN 19670 id 8 predictStart 60 cpu 10
-VLPROFEXEC MTASK_END 19810 id 8 predictCost 107
-VLPROFEXEC MTASK_BEGIN 20650 id 9 predictStart 167 cpu 10
-VLPROFEXEC MTASK_END 20720 id 9 predictCost 30
-VLPROFEXEC MTASK_BEGIN 21140 id 11 predictStart 197 cpu 10
-VLPROFEXEC MTASK_END 21245 id 11 predictCost 30
+VLPROFEXEC MTASK_BEGIN 5495 id 5 predictStart 0 cpu 10 module top
+VLPROFEXEC MTASK_END 6090 predictCost 30
+VLPROFEXEC MTASK_BEGIN 6300 id 7 predictStart 30 cpu 10 module top
+VLPROFEXEC MTASK_END 6895 predictCost 30
+VLPROFEXEC MTASK_BEGIN 7490 id 8 predictStart 60 cpu 10 module top
+VLPROFEXEC MTASK_END 8540 predictCost 107
+VLPROFEXEC MTASK_BEGIN 9135 id 9 predictStart 167 cpu 10 module top
+VLPROFEXEC MTASK_END 9730 predictCost 30
+VLPROFEXEC MTASK_BEGIN 10255 id 11 predictStart 197 cpu 10 module top
+VLPROFEXEC MTASK_END 11060 predictCost 30
+VLPROFEXEC MTASK_BEGIN 18375 id 5 predictStart 0 cpu 10 module top
+VLPROFEXEC MTASK_END 18970 predictCost 30
+VLPROFEXEC MTASK_BEGIN 19145 id 7 predictStart 30 cpu 10 module top
+VLPROFEXEC MTASK_END 19320 predictCost 30
+VLPROFEXEC MTASK_BEGIN 19670 id 8 predictStart 60 cpu 10 module top
+VLPROFEXEC MTASK_END 19810 predictCost 107
+VLPROFEXEC MTASK_BEGIN 20650 id 9 predictStart 167 cpu 10 module top
+VLPROFEXEC MTASK_END 20720 predictCost 30
+VLPROFEXEC MTASK_BEGIN 21140 id 11 predictStart 197 cpu 10 module top
+VLPROFEXEC MTASK_END 21245 predictCost 30
 VLPROFEXEC THREAD_SCHEDULE_WAIT_BEGIN 22000 cpu 10
 VLPROFEXEC THREAD_SCHEDULE_WAIT_END 23000 cpu 10
 VLPROF stat ticks 23415

test_regress/t/t_gantt_io.out

Lines changed: 17 additions & 17 deletions
@@ -10,42 +10,42 @@ Summary:
   Waiting time = 8.54% of elapsed time
   Total threads = 2
   Total CPUs used = 2
-  Total mtasks = 7
+  Total mtasks = 8
   Total yields = 0
 
 NUMA assignment:
   NUMA status = 0,1,4,5;2,3,6,7
 
 Parallelized code, measured:
-  Thread utilization = 14.22%
-  Speedup = 0.284x
+  Thread utilization = 24.72%
+  Speedup = 0.494x
 
 Parallelized code, predicted during static scheduling:
-  Thread utilization = 63.22%
-  Speedup = 1.26x
+  Thread utilization = 69.82%
+  Speedup = 1.4x
 
 All code, measured:
-  Thread utilization = 20.48%
-  Speedup = 0.41x
+  Thread utilization = 29.14%
+  Speedup = 0.583x
 
 All code, measured, scaled by predicted speedup:
-  Thread utilization = 56.80%
-  Speedup = 1.14x
+  Thread utilization = 62.40%
+  Speedup = 1.25x
 
 MTask statistics:
-  Longest mtask id = 5
-  Longest mtask time = 6.16% of time elapsed in parallelized code
-  min log(p2e) = -3.681 from mtask 5 (predict 30, elapsed 1190)
-  max log(p2e) = -2.409 from mtask 8 (predict 107, elapsed 1190)
-  mean = -2.992
-  stddev = 0.459
-  e ^ stddev = 1.583
+  Longest mtask id = 6 from module 'top'
+  Longest mtask time = 17.70% of time elapsed in parallelized code
+  min log(p2e) = -4.736 from module 'top' mtask 6 (predict 30, elapsed 3420)
+  max log(p2e) = -2.409 from module 'top' mtask 8 (predict 107, elapsed 1190)
+  mean = -3.325
+  stddev = 0.692
+  e ^ stddev = 1.998
 
 CPU info:
    Id | Time spent executing MTask | Socket | Core | Model
      | % of elapsed ticks / ticks |        |      |
   ====|============================|========|======|======
    10 | 20.18% / 4725              | 0      | 10   | Test Ryzen 9 3950X 16-Core Processor
-  19 | 3.29% / 770                | 0      | 3    | Test Ryzen 9 3950X 16-Core Processor
+  19 | 20.61% / 4825              | 0      | 3    | Test Ryzen 9 3950X 16-Core Processor
 
 Writing profile_exec.vcd
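
As a cross-check, the updated statistics follow directly from the new ticks in t_gantt_io.dat: mtask 6 of module 'top' spans 2695..5905 (enclosing the nested sub mtask) plus 15610..15820, and the two exec graphs span 945..12180 and 14000..22085. A short sketch reproducing two of the figures above:

    import math

    elapsed_mtask6 = (5905 - 2695) + (15820 - 15610)   # 3420 ticks, as reported
    exec_graph_time = (12180 - 945) + (22085 - 14000)  # 19320 ticks in parallelized code

    print('%.2f%%' % (100 * elapsed_mtask6 / exec_graph_time))  # 17.70% longest-mtask time
    print('%.3f' % math.log(30 / elapsed_mtask6))               # -4.736, the min log(p2e)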
