Fast standalone symbolize for unwinding (#123966)
We've had issues using addr2line. On certain versions of CentOS the installed addr2line has a performance regression that makes it very slow, and even normally it is not that fast, taking several seconds for a typical memory trace dump even when parallelized.

Folly Symbolize or LLVMSymbolize are fast, but using them would require PyTorch to take a dependency on those libraries, and given the number of environments we run in, we end up hitting cases where we fall back to the slow addr2line behavior.

This adds a standalone symbolizer to PyTorch, similar to the unwinder, which has no external dependencies and is ~20x faster than addr2line for symbolizing PyTorch frames.
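
In practice the symbolizer runs wherever PyTorch symbolizes captured C++ frames, e.g. when a memory trace is dumped. A minimal sketch of that workflow is below; it assumes a CUDA build, and the `TORCH_SYMBOLIZE_MODE` environment variable name ("fast" / "addr2line" / "dladdr") is an assumption about how the mode is selected, not something shown in this diff.

```python
# Hedged sketch (not part of this diff): record a CUDA memory trace with C++
# stacks and dump it; the recorded frames are symbolized during the dump.
# Run with TORCH_SYMBOLIZE_MODE=fast set in the environment (assumed variable
# name) to pick the new standalone symbolizer.
import torch

torch.cuda.memory._record_memory_history()           # capture stacks per allocation
x = torch.randn(1024, 1024, device="cuda")            # do some allocations
del x
torch.cuda.memory._dump_snapshot("snapshot.pickle")   # symbolization happens here
```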

I've tested this on some memory profiling runs using all combinations of {gcc, clang} x {dwarf4, dwarf5}, and it seems to do a good job at getting line numbers and function names right. It is also careful to route all reads of library data through the `CheckedLexer` object, which ensures reads stay within the bounds of the section. Errors are routed through UnwindError so that those exceptions get caught and we produce a ?? frame rather than crashing. I also added a fuzz test which feeds all of our symbolizer options random addresses in the process to make sure they do not crash.
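
The fuzz test drives the `torch._C._profiler.symbolize_addresses` binding added in this PR; a minimal hand-run of it looks like the sketch below (Linux only; the address is illustrative).

```python
# Minimal sketch of the testing-only binding: each address maps to a
# (filename, lineno, funcname) tuple, and addresses the symbolizer cannot
# resolve come back as "??" frames rather than raising.
import torch

addrs = [0x7F0000001000]  # hypothetical address inside some loaded .so
for filename, lineno, funcname in torch._C._profiler.symbolize_addresses(addrs, "fast"):
    print(f"{funcname} ({filename}:{lineno})")
```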

Differential Revision: [D56828968](https://our.internmc.facebook.com/intern/diff/D56828968)
Pull Request resolved: #123966
Approved by: https://github.com/ezyang, https://github.com/aaronenyeshi
zdevito authored and pytorchmergebot committed May 14, 2024
1 parent 5fb4a76 commit 352a893
Showing 23 changed files with 1,595 additions and 111 deletions.
69 changes: 69 additions & 0 deletions test/profiler/test_profiler.py
@@ -16,9 +16,12 @@
import collections
import gc
import json
import mmap
import os
import pickle
import random
import re
import struct
import subprocess
import sys
import threading
@@ -64,7 +67,9 @@
from torch.testing._internal.common_device_type import skipCUDAVersionIn
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
IS_ARM64,
IS_JETSON,
IS_LINUX,
IS_WINDOWS,
parametrize,
run_tests,
@@ -2436,6 +2441,70 @@ def test_profiler_pattern_matcher_json_report(self):
finally:
os.remove("torchtidy_report.json")

@unittest.skipIf(IS_ARM64 or not IS_LINUX, "x86 linux only cpp unwinding")
def test_fuzz_symbolize(self):
# generate some random addresses in the text section and make sure the
# symbolizers do not throw exceptions/crash
def get_text_sections():
text_sections = []
seen = set()
for filename in os.listdir("/proc/self/map_files"):
library = os.readlink("/proc/self/map_files/" + filename)
if ".so" not in library or library in seen:
continue
seen.add(library)
with open(os.path.join("/proc/self/map_files", library), "rb") as f:
mm = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)

def unpack(fmt, offset):
return struct.unpack(
fmt, mm[offset : offset + struct.calcsize(fmt)]
)

if mm[:4] != b"\x7fELF":
continue
(section_headers_start,) = unpack("Q", 40)
(section_header_size,) = unpack("H", 58)
(num_section_headers,) = unpack("H", 60)
(shstrndx,) = unpack("H", 62)
(shstrtab_offset,) = unpack(
"Q", section_headers_start + shstrndx * section_header_size + 24
)
for i in range(num_section_headers):
(section_name_offset,) = unpack(
"I", section_headers_start + i * section_header_size
)
name_start = shstrtab_offset + section_name_offset
section_name = mm[name_start : name_start + 6]
if section_name != b".text\0":
continue
(section_offset,) = unpack(
"Q", section_headers_start + i * section_header_size + 24
)
(section_size,) = unpack(
"Q", section_headers_start + i * section_header_size + 32
)
start = int(filename.split("-")[0], 16) + section_offset
text_sections.append((start, section_size))
break
mm.close()
return text_sections

r = random.Random()
r.seed(1)
text_sections = get_text_sections()
addrs = []
for i in range(200):
s = r.randrange(0, len(text_sections))
start, size = text_sections[s]
addr = r.randrange(start, start + size)
addrs.append(addr)
fast = torch._C._profiler.symbolize_addresses(addrs, "fast")
dladdr = torch._C._profiler.symbolize_addresses(addrs, "dladdr")
addr2line = torch._C._profiler.symbolize_addresses(addrs, "addr2line")
self.assertEqual(len(fast), len(addrs))
self.assertEqual(len(addr2line), len(fast))


if __name__ == "__main__":
run_tests()
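
For reference, `get_text_sections()` above is a hand-rolled ELF64 header walk. The sketch below restates the same logic for a single file using the same fixed offsets; `find_text_section` is a hypothetical helper name, not part of the test.

```python
# Standalone sketch of the ELF64 header walk the test performs; offsets are the
# standard ELF64 field positions (e_shoff=40, e_shentsize=58, e_shnum=60,
# e_shstrndx=62; sh_offset=+24, sh_size=+32 within a section header).
import struct

def find_text_section(path):
    with open(path, "rb") as f:
        data = f.read()
    if data[:4] != b"\x7fELF":
        return None
    (e_shoff,) = struct.unpack_from("Q", data, 40)
    (e_shentsize,) = struct.unpack_from("H", data, 58)
    (e_shnum,) = struct.unpack_from("H", data, 60)
    (e_shstrndx,) = struct.unpack_from("H", data, 62)
    (shstrtab_off,) = struct.unpack_from(
        "Q", data, e_shoff + e_shstrndx * e_shentsize + 24
    )
    for i in range(e_shnum):
        hdr = e_shoff + i * e_shentsize
        (name_off,) = struct.unpack_from("I", data, hdr)
        if data[shstrtab_off + name_off : shstrtab_off + name_off + 6] == b".text\0":
            (sh_offset,) = struct.unpack_from("Q", data, hdr + 24)
            (sh_size,) = struct.unpack_from("Q", data, hdr + 32)
            return sh_offset, sh_size
    return None
```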
10 changes: 6 additions & 4 deletions torch/csrc/Module.cpp
@@ -168,12 +168,14 @@ static PyObject* THPModule_initExtension(
PyObject* shm_manager_path) {
HANDLE_TH_ERRORS
#if !defined(FBCODE_CAFFE2)
if (torch::get_cpp_stacktraces_enabled() && !torch::get_disable_addr2line()) {
if (torch::get_cpp_stacktraces_enabled()) {
c10::SetStackTraceFetcher([]() -> std::string {
auto tb = torch::CapturedTraceback::gather(false, false, true);
LOG(WARNING)
<< "symbolizing C++ stack trace for exception; if this hangs, rerun with TORCH_DISABLE_ADDR2LINE=1..."
<< std::endl;
if (torch::get_symbolize_mode() == torch::unwind::Mode::addr2line) {
LOG(WARNING)
<< "symbolizing C++ stack trace for exception; if this hangs, rerun with TORCH_DISABLE_ADDR2LINE=1..."
<< std::endl;
}
auto s_tbs = torch::symbolize({tb.get()});
std::stringstream oss;
oss << "C++ CapturedTraceback:" << std::endl;
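The Module.cpp change above only affects when the addr2line warning is logged. A hedged way to exercise the path (assuming a non-fbcode build) is to run with `TORCH_SHOW_CPP_STACKTRACES=1` and trigger an error:

```python
# Run with TORCH_SHOW_CPP_STACKTRACES=1 in the environment. The error message
# gains a "C++ CapturedTraceback:" section; with the change above, the
# "rerun with TORCH_DISABLE_ADDR2LINE=1" warning is only printed when the
# addr2line symbolizer is actually the selected mode.
import torch

try:
    torch.empty(1).view(2)  # invalid view -> RuntimeError with a C++ trace appended
except RuntimeError as e:
    print(e)
```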
3 changes: 2 additions & 1 deletion torch/csrc/profiler/combined_traceback.cpp
@@ -1,4 +1,5 @@
#include <torch/csrc/profiler/combined_traceback.h>
#include <torch/csrc/utils/cpp_stacktraces.h>

namespace torch {

@@ -77,7 +78,7 @@ SymbolizedTracebacks symbolize(
}
// gather symbol names for C++ frames
if (!all_cpp_ips.empty()) {
r.all_frames = unwind::symbolize(all_cpp_ips);
r.all_frames = unwind::symbolize(all_cpp_ips, torch::get_symbolize_mode());
}

// batch symbolization requests so we dedup frame objects
39 changes: 31 additions & 8 deletions torch/csrc/profiler/python/init.cpp
@@ -79,8 +79,7 @@ PyTypeObject THPCapturedTracebackType = {
nullptr, /* tp_new */
};

namespace pybind11 {
namespace detail {
namespace pybind11::detail {

template <>
struct type_caster<std::shared_ptr<torch::CapturedTraceback>> {
@@ -107,11 +106,9 @@ struct type_caster<std::shared_ptr<torch::CapturedTraceback>> {
}
};

} // namespace detail
} // namespace pybind11
} // namespace pybind11::detail

namespace torch {
namespace profiler {
namespace torch::profiler {

/* [NOTE: RecordFunctionFast]
* This is an alternate way to call record_function from python.
@@ -606,6 +603,33 @@ void initPythonBindings(PyObject* module) {
}
return py_symbolize(tb_ptrs);
});
// directly convert address pointers to frames, used for testing symbolize
m.def(
"symbolize_addresses",
[](const std::vector<uint64_t>& frames, const std::string& mode_s) {
std::vector<std::tuple<std::string, int64_t, std::string>> frames_out;
torch::unwind::Mode mode = torch::unwind::Mode::addr2line;
if (mode_s == "fast") {
mode = torch::unwind::Mode::fast;
} else if (mode_s == "addr2line") {
mode = torch::unwind::Mode::addr2line;
} else if (mode_s == "dladdr") {
mode = torch::unwind::Mode::dladdr;
} else {
TORCH_CHECK(false, "unexpected mode ", mode_s);
}
std::vector<void*> frames_p;
frames_p.reserve(frames.size());
for (auto f : frames) {
frames_p.push_back((void*)f); // NOLINT
}
auto frame_objects = unwind::symbolize(frames_p, mode);
frames_out.reserve(frame_objects.size());
for (auto& frame : frame_objects) {
frames_out.emplace_back(frame.filename, frame.lineno, frame.funcname);
}
return frames_out;
});
installCapturedTracebackPython();

// NOLINTNEXTLINE(*-c-arrays*)
Expand Down Expand Up @@ -639,5 +663,4 @@ void initPythonBindings(PyObject* module) {
throw python_error();
}
}
} // namespace profiler
} // namespace torch
} // namespace torch::profiler
4 changes: 4 additions & 0 deletions torch/csrc/profiler/unwind/action.h
@@ -2,6 +2,8 @@
#include <stdint.h>
#include <ostream>

namespace torch::unwind {

enum {
A_UNDEFINED = 0x0,
A_REG_PLUS_DATA = 0x1, // exp = REG[reg] + data0
@@ -53,3 +55,5 @@ struct Action {
return out;
}
};

} // namespace torch::unwind
3 changes: 3 additions & 0 deletions torch/csrc/profiler/unwind/communicate.h
@@ -5,6 +5,7 @@
#include <unistd.h>
#include <memory>

namespace torch::unwind {
// helper to open a process with stdin/stdout/stderr streams.
struct Communicate {
Communicate(const char* command, const char** args) {
@@ -63,3 +64,5 @@ struct Communicate {
std::unique_ptr<std::ostream> out_;
std::unique_ptr<std::ostream> err_;
};

} // namespace torch::unwind
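
`Communicate` is the plumbing the existing addr2line backend presumably uses to talk to a child process over its stdin/stdout pipes. A rough Python equivalent of that round trip (illustrative command line and library path, not the exact invocation PyTorch issues) shows why an out-of-process symbolizer is comparatively slow:

```python
# Rough equivalent of what the Communicate helper exists for: driving an
# external `addr2line` process over stdin/stdout. Spawning a process per
# library plus pipe round trips is what makes this path slower than the
# in-process "fast" symbolizer.
import subprocess

proc = subprocess.Popen(
    ["addr2line", "-f", "-C", "-e", "/usr/lib/x86_64-linux-gnu/libc.so.6"],
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    text=True,
)
proc.stdin.write("0x1000\n")  # hypothetical library-relative offset
proc.stdin.flush()
funcname = proc.stdout.readline().strip()   # -f prints the function name first,
location = proc.stdout.readline().strip()   # then "file:line" on the next line
print(funcname, location)
proc.terminate()
```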
