Skip to content

Add tests for accessing uninitialized holders #5654

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 15 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ set(PYBIND11_TEST_FILES
test_exceptions
test_factory_constructors
test_gil_scoped
test_invalid_holder_access
test_iostream
test_kwargs_and_defaults
test_local_bindings
Expand Down Expand Up @@ -556,7 +557,7 @@ set(PYBIND11_PYTEST_ARGS
# A single command to compile and run the tests
add_custom_target(
pytest
COMMAND ${PYBIND11_TEST_PREFIX_COMMAND} ${PYTHON_EXECUTABLE} -m pytest
COMMAND ${PYBIND11_TEST_PREFIX_COMMAND} ${PYTHON_EXECUTABLE} -X faulthandler -m pytest
${PYBIND11_ABS_PYTEST_FILES} ${PYBIND11_PYTEST_ARGS}
DEPENDS ${test_targets}
WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
Expand Down
14 changes: 14 additions & 0 deletions tests/pybind11_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,19 @@ const char *cpp_std() {
#endif
}

int cpp_std_num() {
return
#if defined(PYBIND11_CPP20)
20;
#elif defined(PYBIND11_CPP17)
17;
#elif defined(PYBIND11_CPP14)
14;
#else
11;
#endif
}

PYBIND11_MODULE(pybind11_tests, m, py::mod_gil_not_used()) {
m.doc() = "pybind11 test module";

Expand All @@ -88,6 +101,7 @@ PYBIND11_MODULE(pybind11_tests, m, py::mod_gil_not_used()) {
m.attr("compiler_info") = py::none();
#endif
m.attr("cpp_std") = cpp_std();
m.attr("cpp_std_num") = cpp_std_num();
m.attr("PYBIND11_INTERNALS_ID") = PYBIND11_INTERNALS_ID;
// Free threaded Python uses UINT32_MAX for immortal objects.
m.attr("PYBIND11_REFCNT_IMMORTAL") = UINT32_MAX;
Expand Down
145 changes: 145 additions & 0 deletions tests/test_invalid_holder_access.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#include "pybind11_tests.h"

#include <Python.h>
#include <memory>
#include <vector>

class VecOwnsObjs {
public:
void append(const py::object &obj) { vec.emplace_back(obj); }

void set_item(py::ssize_t i, const py::object &obj) {
if (!(i >= 0 && i < size())) {
throw std::out_of_range("Index out of range");
}
vec[py::size_t(i)] = obj;
}

py::object get_item(py::ssize_t i) const {
if (!(i >= 0 && i < size())) {
throw std::out_of_range("Index out of range");
}
return vec[py::size_t(i)];
}

py::ssize_t size() const { return py::ssize_t_cast(vec.size()); }

bool is_empty() const { return vec.empty(); }

static int tp_traverse(PyObject *self_base, visitproc visit, void *arg) {
#if PY_VERSION_HEX >= 0x03090000 // Python 3.9
Py_VISIT(Py_TYPE(self_base));
#endif
if (should_check_holder_initialization) {
auto *const instance = reinterpret_cast<py::detail::instance *>(self_base);
if (!instance->get_value_and_holder().holder_constructed()) {
// The holder has not been constructed yet. Skip the traversal to avoid
// segmentation faults.
return 0;
}
}
auto &self = py::cast<VecOwnsObjs &>(py::handle{self_base});
for (const auto &obj : self.vec) {
Py_VISIT(obj.ptr());
}
return 0;
}

static int tp_clear(PyObject *self_base) {
if (should_check_holder_initialization) {
auto *const instance = reinterpret_cast<py::detail::instance *>(self_base);
if (!instance->get_value_and_holder().holder_constructed()) {
// The holder has not been constructed yet. Skip the traversal to avoid
// segmentation faults.
return 0;
}
}
auto &self = py::cast<VecOwnsObjs &>(py::handle{self_base});
for (auto &obj : self.vec) {
Py_CLEAR(obj.ptr());
}
self.vec.clear();
return 0;
}

py::object get_state() const {
py::list state{};
for (const auto &item : vec) {
state.append(item);
}
return py::tuple(state);
}

static bool get_should_check_holder_initialization() {
return should_check_holder_initialization;
}

static void set_should_check_holder_initialization(bool value) {
should_check_holder_initialization = value;
}

static bool get_should_raise_error_on_set_state() { return should_raise_error_on_set_state; }

static void set_should_raise_error_on_set_state(bool value) {
should_raise_error_on_set_state = value;
}

static bool should_check_holder_initialization;
static bool should_raise_error_on_set_state;

private:
std::vector<py::object> vec{};
};

bool VecOwnsObjs::should_check_holder_initialization = false;
bool VecOwnsObjs::should_raise_error_on_set_state = false;

TEST_SUBMODULE(invalid_holder_access, m) {
m.doc() = "Test invalid holder access";

#if defined(PYBIND11_CPP14)
m.def("create_vector", [](const py::iterable &iterable) -> std::unique_ptr<VecOwnsObjs> {
auto vec = std::make_unique<VecOwnsObjs>();
for (const auto &item : iterable) {
vec->append(py::reinterpret_borrow<py::object>(item));
}
return vec;
});
#endif

py::class_<VecOwnsObjs>(
m, "VecOwnsObjs", py::custom_type_setup([](PyHeapTypeObject *heap_type) -> void {
auto *const type = &heap_type->ht_type;
type->tp_flags |= Py_TPFLAGS_HAVE_GC;
type->tp_traverse = &VecOwnsObjs::tp_traverse;
type->tp_clear = &VecOwnsObjs::tp_clear;
}))
.def_static("set_should_check_holder_initialization",
&VecOwnsObjs::set_should_check_holder_initialization,
py::arg("value"))
.def_static("set_should_raise_error_on_set_state",
&VecOwnsObjs::set_should_raise_error_on_set_state,
py::arg("value"))
#if defined(PYBIND11_CPP14)
.def(py::pickle([](const VecOwnsObjs &self) -> py::object { return self.get_state(); },
[](const py::object &state) -> std::unique_ptr<VecOwnsObjs> {
if (!py::isinstance<py::tuple>(state)) {
throw std::runtime_error("Invalid state");
}
auto vec = std::make_unique<VecOwnsObjs>();
if (VecOwnsObjs::get_should_raise_error_on_set_state()) {
throw std::runtime_error("raise error on set_state for testing");
}
for (const auto &item : state) {
vec->append(py::reinterpret_borrow<py::object>(item));
}
return vec;
}),
py::arg("state"))
#endif
.def("append", &VecOwnsObjs::append, py::arg("obj"))
.def("is_empty", &VecOwnsObjs::is_empty)
.def("__setitem__", &VecOwnsObjs::set_item, py::arg("i"), py::arg("obj"))
.def("__getitem__", &VecOwnsObjs::get_item, py::arg("i"))
.def("__len__", &VecOwnsObjs::size);
}
187 changes: 187 additions & 0 deletions tests/test_invalid_holder_access.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
from __future__ import annotations

import gc
import multiprocessing
import pickle
import signal
import sys
import weakref

import pytest

import env # noqa: F401
import pybind11_tests
from pybind11_tests import invalid_holder_access as m

XFAIL_REASON = "Known issues: https://github.com/pybind/pybind11/pull/5654"


try:
import multiprocessing

spawn_context = multiprocessing.get_context("spawn")
except (ImportError, ValueError):
spawn_context = None


def check_segfault(target):
"""Check if the target function causes a segmentation fault.

The check should be done in a separate process to avoid crashing the main
process with the segfault.
"""
process = spawn_context.Process(target=target)
process.start()
process.join()
rc = abs(process.exitcode)
if 128 < rc < 256:
rc -= 128
assert rc in (
0,
signal.SIGSEGV,
signal.SIGABRT,
0xC0000005, # STATUS_ACCESS_VIOLATION on Windows
)
if rc != 0:
raise SystemError(
"Segmentation Fault: The C++ compiler initializes container incorrectly."
)


def test_no_init():
with pytest.raises(TypeError, match=r"No constructor defined"):
m.VecOwnsObjs()
vec = m.VecOwnsObjs.__new__(m.VecOwnsObjs)
with pytest.raises(TypeError, match=r"No constructor defined"):
vec.__init__()


def manual_new_target():
# Repeatedly trigger allocation without initialization (raw malloc'ed) to
# detect uninitialized memory bugs.
for _ in range(32):
# The holder is a pointer variable while the C++ ctor is not called.
vec = m.VecOwnsObjs.__new__(m.VecOwnsObjs)
if vec.is_empty(): # <= this line could cause a segfault
# The C++ compiler initializes container correctly.
assert len(vec) == 0
else:
# The program is not supposed to reach here. It will abort with
# SIGSEGV on `vec.is_empty()`.
sys.exit(signal.SIGSEGV) # manually trigger SIGSEGV if not raised


# This test might succeed on some platforms with some compilers, but it is not
# guaranteed to work everywhere. It is marked as non-strict xfail.
@pytest.mark.xfail(reason=XFAIL_REASON, raises=SystemError, strict=False)
@pytest.mark.skipif(spawn_context is None, reason="spawn context not available")
@pytest.mark.skipif(
sys.platform.startswith("emscripten"),
reason="Requires multiprocessing",
)
def test_manual_new():
"""
Manually calling the constructor (__new__) of a class that has C++ STL
container will allocate memory without initializing it can cause a
segmentation fault.
"""
check_segfault(manual_new_target)


@pytest.mark.skipif(
pybind11_tests.cpp_std_num < 14,
reason="std::{unique_ptr,make_unique} not available in C++11",
)
def test_set_state_with_error_no_segfault_if_gc_checks_holder_has_initialized():
"""
The ``tp_traverse`` and ``tp_clear`` functions should check if the holder
has been initialized before traversing or clearing the C++ STL container.
"""
m.VecOwnsObjs.set_should_check_holder_initialization(True)

vec = m.create_vector((1, 2, 3, 4))

m.VecOwnsObjs.set_should_raise_error_on_set_state(False)
pickle.loads(pickle.dumps(vec))

# During the unpickling process, Python firstly allocates the object with
# the `__new__` method and then calls the `__setstate__`` method to set the
# state of the object.
#
# obj = cls.__new__(cls)
# obj.__setstate__(state)
#
# The `__init__` method is not called during unpickling.
# So, if the `__setstate__` method raises an error, the object will be in
# an uninitialized state.
m.VecOwnsObjs.set_should_raise_error_on_set_state(True)
serialized = pickle.dumps(vec)
with pytest.raises(
RuntimeError,
match="raise error on set_state for testing",
):
pickle.loads(serialized)


def unpicklable_with_error_target():
m.VecOwnsObjs.set_should_check_holder_initialization(False)
m.VecOwnsObjs.set_should_raise_error_on_set_state(True)

vec = m.create_vector((1, 2, 3, 4))
serialized = pickle.dumps(vec)

# Repeatedly trigger allocation without initialization (raw malloc'ed) to
# detect uninitialized memory bugs.
for _ in range(32):
# During the unpickling process, Python firstly allocates the object with
# the `__new__` method and then calls the `__setstate__`` method to set the
# state of the object.
#
# obj = cls.__new__(cls)
# obj.__setstate__(state)
#
# The `__init__` method is not called during unpickling.
# So, if the `__setstate__` method raises an error, the object will be in
# an uninitialized state. The GC will traverse the uninitialized C++ STL
# container and cause a segmentation fault.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please explain (like you would in documentation) why this is useful, and what exactly cannot be achieved with a normal init.

@rwgk I added a test and comments for this in the latest comment.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, this goes into a different direction?

Is the primary goal to fix this hole in the pickle support?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see we touched on this before:

Unfortunately, __getinitargs__ does not fit my use case because I do not bind the C++ ctor to Python __init__.

Well, that's a choice.

That, or we need to find a mechanism to gracefully handle exceptions in __setstate__.

Copy link
Contributor Author

@XuehaiPan XuehaiPan May 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding a public API to check whether the holder is constructed can resolve my issue.

// Maybe need to check if the object is an instance of pybind11 type
bool is_holder_constructed(PyObject *object) {
    auto *const instance = reinterpret_cast<py::detail::instance *>(self_base);
    return instance->get_value_and_holder().holder_constructed();
}

I also updated the documentation about this but accessing the pybind11::detail namespace.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The general idea sounds good, but I have doubts about adding it as pybind11:: is_holder_constructed().

To be safe enough for a public API, we'd need to guard with a check: "is this actually a pybind11 class_-wrapped type"?

Something like PyTypeObject *srctype = Py_TYPE(src.ptr()); and all_type_info(srctype) probably (see pybind11/detail/type_caster_base.h).

But in the context of custom_type_setup, tp_traverse, tp_clear, is that actually needed?

Another idea:

I see this in the tp_traverse and tp_clear implementations:

                auto &self = py::cast<OwnsPythonObjects &>(py::handle(self_base));

That could do the holder_constructed check internally, my guess is cheaply, although I could be wrong.

We might have to change the recipe to:

                auto *self = py::cast<OwnsPythonObjects *>(py::handle(self_base));

And if the holder isn't constructed, this could return nullptr.

Would that still work for your purposes?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see this in the tp_traverse and tp_clear implementations:

                auto &self = py::cast<OwnsPythonObjects &>(py::handle(self_base));

That could do the holder_constructed check internally, my guess is cheaply, although I could be wrong.

If the instance is not initialized, what behavior would be expected? Should it raise an error? I don't know if it is allowed to raise exceptions in tp_traverse and tp_clear. These two functions should always succeed, I think.

We might have to change the recipe to:

                auto *self = py::cast<OwnsPythonObjects *>(py::handle(self_base));

And if the holder isn't constructed, this could return nullptr.

This works for me! I think this would be the best solution to have minimal impact on the downstream and be easy to migrate.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This works for me! I think this would be the best solution to have minimal impact on the downstream and be easy to migrate.

Do you think you could try? — I can only offer to assist, by answering question if you run into roadblocks.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It crossed my mind: Possibly the suggested change could get very involved ("hairy"). In case you start to feel it's too much, I'd be OK to add pybind11::detail:: is_holder_constructed() as a semi-private API, without runtime guards, but a carefully written comment to explain the critical precondition: the caller is responsible for ensuring that the reinterpret_cast is valid.

try: # noqa: SIM105
pickle.loads(serialized)
except RuntimeError:
pass


# This test might succeed on some platforms with some compilers, but it is not
# guaranteed to work everywhere. It is marked as non-strict xfail.
@pytest.mark.xfail(reason=XFAIL_REASON, raises=SystemError, strict=False)
@pytest.mark.skipif(spawn_context is None, reason="spawn context not available")
@pytest.mark.skipif(
pybind11_tests.cpp_std_num < 14,
reason="std::{unique_ptr,make_unique} not available in C++11",
)
def test_set_state_with_error_will_segfault_if_gc_does_not_check_holder_has_initialized():
m.VecOwnsObjs.set_should_check_holder_initialization(False)

vec = m.create_vector((1, 2, 3, 4))

m.VecOwnsObjs.set_should_raise_error_on_set_state(False)
pickle.loads(pickle.dumps(vec))

# See comments above.
check_segfault(unpicklable_with_error_target)


@pytest.mark.skipif("env.PYPY or env.GRAALPY", reason="Cannot reliably trigger GC")
@pytest.mark.skipif(
pybind11_tests.cpp_std_num < 14,
reason="std::{unique_ptr,make_unique} not available in C++11",
)
def test_gc():
vec = m.create_vector(())
vec.append((vec, vec))

wr = weakref.ref(vec)
assert wr() is vec
del vec
for _ in range(10):
gc.collect()
assert wr() is None
Loading