Skip to content

Commit 352a893

Browse files
zdevito and pytorchmergebot
authored and committed
Fast standalone symbolize for unwinding (#123966)
We've had issues using addr2line. On certain versions of CentOS the installed addr2line is a version that has a performance regression making it very slow, and even normally it is not that fast, taking several seconds even when parallelized for a typical memory trace dump. Folly Symbolize or LLVMSymbolize are fast, but using them requires PyTorch to take a dependency on those libraries, and given the number of environments we run stuff in, we end up hitting cases where we fall back to slow addr2line behavior. This adds a standalone symbolizer to PyTorch similar to the unwinder which has no external dependencies and is ~20x faster than addr2line for unwinding PyTorch frames. I've tested this on some memory profiling runs using all combinations of {gcc, clang} x {dwarf4, dwarf5} and it seems to do a good job at getting line numbers and function names right. It is also careful to route all reads of library data through the `CheckedLexer` object, which ensures it is not reading out of bounds of the section. Errors are routed through UnwindError so that those exceptions get caught and we produce a ?? frame rather than crash. I also added a fuzz test which gives all our symbolizer options random addresses in the process to make sure they do not crash. Differential Revision: [D56828968](https://our.internmc.facebook.com/intern/diff/D56828968) Pull Request resolved: #123966 Approved by: https://github.com/ezyang, https://github.com/aaronenyeshi
1 parent 5fb4a76 commit 352a893

23 files changed

+1595
-111
lines changed

test/profiler/test_profiler.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,12 @@
1616
import collections
1717
import gc
1818
import json
19+
import mmap
1920
import os
2021
import pickle
22+
import random
2123
import re
24+
import struct
2225
import subprocess
2326
import sys
2427
import threading
@@ -64,7 +67,9 @@
6467
from torch.testing._internal.common_device_type import skipCUDAVersionIn
6568
from torch.testing._internal.common_utils import (
6669
instantiate_parametrized_tests,
70+
IS_ARM64,
6771
IS_JETSON,
72+
IS_LINUX,
6873
IS_WINDOWS,
6974
parametrize,
7075
run_tests,
@@ -2436,6 +2441,70 @@ def test_profiler_pattern_matcher_json_report(self):
24362441
finally:
24372442
os.remove("torchtidy_report.json")
24382443

2444+
@unittest.skipIf(IS_ARM64 or not IS_LINUX, "x86 linux only cpp unwinding")
2445+
def test_fuzz_symbolize(self):
2446+
# generate some random addresses in the text section and make sure the
2447+
# symbolizers do not throw exceptions/crash
2448+
def get_text_sections():
2449+
text_sections = []
2450+
seen = set()
2451+
for filename in os.listdir("/proc/self/map_files"):
2452+
library = os.readlink("/proc/self/map_files/" + filename)
2453+
if ".so" not in library or library in seen:
2454+
continue
2455+
seen.add(library)
2456+
with open(os.path.join("/proc/self/map_files", library), "rb") as f:
2457+
mm = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
2458+
2459+
def unpack(fmt, offset):
2460+
return struct.unpack(
2461+
fmt, mm[offset : offset + struct.calcsize(fmt)]
2462+
)
2463+
2464+
if mm[:4] != b"\x7fELF":
2465+
continue
2466+
(section_headers_start,) = unpack("Q", 40)
2467+
(section_header_size,) = unpack("H", 58)
2468+
(num_section_headers,) = unpack("H", 60)
2469+
(shstrndx,) = unpack("H", 62)
2470+
(shstrtab_offset,) = unpack(
2471+
"Q", section_headers_start + shstrndx * section_header_size + 24
2472+
)
2473+
for i in range(num_section_headers):
2474+
(section_name_offset,) = unpack(
2475+
"I", section_headers_start + i * section_header_size
2476+
)
2477+
name_start = shstrtab_offset + section_name_offset
2478+
section_name = mm[name_start : name_start + 6]
2479+
if section_name != b".text\0":
2480+
continue
2481+
(section_offset,) = unpack(
2482+
"Q", section_headers_start + i * section_header_size + 24
2483+
)
2484+
(section_size,) = unpack(
2485+
"Q", section_headers_start + i * section_header_size + 32
2486+
)
2487+
start = int(filename.split("-")[0], 16) + section_offset
2488+
text_sections.append((start, section_size))
2489+
break
2490+
mm.close()
2491+
return text_sections
2492+
2493+
r = random.Random()
2494+
r.seed(1)
2495+
text_sections = get_text_sections()
2496+
addrs = []
2497+
for i in range(200):
2498+
s = r.randrange(0, len(text_sections))
2499+
start, size = text_sections[s]
2500+
addr = r.randrange(start, start + size)
2501+
addrs.append(addr)
2502+
fast = torch._C._profiler.symbolize_addresses(addrs, "fast")
2503+
dladdr = torch._C._profiler.symbolize_addresses(addrs, "dladdr")
2504+
addr2line = torch._C._profiler.symbolize_addresses(addrs, "addr2line")
2505+
self.assertEqual(len(fast), len(addrs))
2506+
self.assertEqual(len(addr2line), len(fast))
2507+
24392508

24402509
if __name__ == "__main__":
24412510
run_tests()

torch/csrc/Module.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -168,12 +168,14 @@ static PyObject* THPModule_initExtension(
168168
PyObject* shm_manager_path) {
169169
HANDLE_TH_ERRORS
170170
#if !defined(FBCODE_CAFFE2)
171-
if (torch::get_cpp_stacktraces_enabled() && !torch::get_disable_addr2line()) {
171+
if (torch::get_cpp_stacktraces_enabled()) {
172172
c10::SetStackTraceFetcher([]() -> std::string {
173173
auto tb = torch::CapturedTraceback::gather(false, false, true);
174-
LOG(WARNING)
175-
<< "symbolizing C++ stack trace for exception; if this hangs, rerun with TORCH_DISABLE_ADDR2LINE=1..."
176-
<< std::endl;
174+
if (torch::get_symbolize_mode() == torch::unwind::Mode::addr2line) {
175+
LOG(WARNING)
176+
<< "symbolizing C++ stack trace for exception; if this hangs, rerun with TORCH_DISABLE_ADDR2LINE=1..."
177+
<< std::endl;
178+
}
177179
auto s_tbs = torch::symbolize({tb.get()});
178180
std::stringstream oss;
179181
oss << "C++ CapturedTraceback:" << std::endl;

torch/csrc/profiler/combined_traceback.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include <torch/csrc/profiler/combined_traceback.h>
2+
#include <torch/csrc/utils/cpp_stacktraces.h>
23

34
namespace torch {
45

@@ -77,7 +78,7 @@ SymbolizedTracebacks symbolize(
7778
}
7879
// gather symbol names for C++ frames
7980
if (!all_cpp_ips.empty()) {
80-
r.all_frames = unwind::symbolize(all_cpp_ips);
81+
r.all_frames = unwind::symbolize(all_cpp_ips, torch::get_symbolize_mode());
8182
}
8283

8384
// batch symbolization requests so we dedup frame objects

torch/csrc/profiler/python/init.cpp

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,7 @@ PyTypeObject THPCapturedTracebackType = {
7979
nullptr, /* tp_new */
8080
};
8181

82-
namespace pybind11 {
83-
namespace detail {
82+
namespace pybind11::detail {
8483

8584
template <>
8685
struct type_caster<std::shared_ptr<torch::CapturedTraceback>> {
@@ -107,11 +106,9 @@ struct type_caster<std::shared_ptr<torch::CapturedTraceback>> {
107106
}
108107
};
109108

110-
} // namespace detail
111-
} // namespace pybind11
109+
} // namespace pybind11::detail
112110

113-
namespace torch {
114-
namespace profiler {
111+
namespace torch::profiler {
115112

116113
/* [NOTE: RecordFunctionFast]
117114
* This is an alternate way to call record_function from python.
@@ -606,6 +603,33 @@ void initPythonBindings(PyObject* module) {
606603
}
607604
return py_symbolize(tb_ptrs);
608605
});
606+
// directly convert address pointers to frames, used for testing symbolize
607+
m.def(
608+
"symbolize_addresses",
609+
[](const std::vector<uint64_t>& frames, const std::string& mode_s) {
610+
std::vector<std::tuple<std::string, int64_t, std::string>> frames_out;
611+
torch::unwind::Mode mode = torch::unwind::Mode::addr2line;
612+
if (mode_s == "fast") {
613+
mode = torch::unwind::Mode::fast;
614+
} else if (mode_s == "addr2line") {
615+
mode = torch::unwind::Mode::addr2line;
616+
} else if (mode_s == "dladdr") {
617+
mode = torch::unwind::Mode::dladdr;
618+
} else {
619+
TORCH_CHECK(false, "unexpected mode ", mode_s);
620+
}
621+
std::vector<void*> frames_p;
622+
frames_p.reserve(frames.size());
623+
for (auto f : frames) {
624+
frames_p.push_back((void*)f); // NOLINT
625+
}
626+
auto frame_objects = unwind::symbolize(frames_p, mode);
627+
frames_out.reserve(frame_objects.size());
628+
for (auto& frame : frame_objects) {
629+
frames_out.emplace_back(frame.filename, frame.lineno, frame.funcname);
630+
}
631+
return frames_out;
632+
});
609633
installCapturedTracebackPython();
610634

611635
// NOLINTNEXTLINE(*-c-arrays*)
@@ -639,5 +663,4 @@ void initPythonBindings(PyObject* module) {
639663
throw python_error();
640664
}
641665
}
642-
} // namespace profiler
643-
} // namespace torch
666+
} // namespace torch::profiler

torch/csrc/profiler/unwind/action.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
#include <stdint.h>
33
#include <ostream>
44

5+
namespace torch::unwind {
6+
57
enum {
68
A_UNDEFINED = 0x0,
79
A_REG_PLUS_DATA = 0x1, // exp = REG[reg] + data0
@@ -53,3 +55,5 @@ struct Action {
5355
return out;
5456
}
5557
};
58+
59+
} // namespace torch::unwind

torch/csrc/profiler/unwind/communicate.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <unistd.h>
66
#include <memory>
77

8+
namespace torch::unwind {
89
// helper to open a process with stdin/stdout/stderr streams.
910
struct Communicate {
1011
Communicate(const char* command, const char** args) {
@@ -63,3 +64,5 @@ struct Communicate {
6364
std::unique_ptr<std::ostream> out_;
6465
std::unique_ptr<std::ostream> err_;
6566
};
67+
68+
} // namespace torch::unwind

0 commit comments

Comments
 (0)