Skip to content

Commit

Permalink
Traverse directories in stable order when building a PEX (#2220)
Browse files Browse the repository at this point in the history
Addresses one part of #2155.

Adds a wrapper for `os.walk` that sorts directory entries as it
traverses directories. As discussed in the issue, `os.walk` may yield
directory entries in a different order depending on (possibly) the file
system.

I don't know if the two `os.walk` calls that this PR replaces are enough
for addressing this class of non-determinism in pex, but these were
sufficient for solving the issue in my case. Happy to expand the use of
the wrapper if there are other places that it should be used.
  • Loading branch information
ento authored Aug 21, 2023
1 parent 45eea4b commit c6fba6d
Show file tree
Hide file tree
Showing 5 changed files with 154 additions and 4 deletions.
29 changes: 27 additions & 2 deletions pex/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
Dict,
Iterable,
Iterator,
List,
NoReturn,
Optional,
Set,
Expand Down Expand Up @@ -236,6 +237,30 @@ def open_zip(path, *args, **kwargs):
yield zip


def deterministic_walk(*args, **kwargs):
# type: (*Any, **Any) -> Iterator[Tuple[str, List[str], List[str]]]
"""Walk the specified directory tree in deterministic order.
Takes the same parameters as os.walk and yields tuples of the same shape,
except for the `topdown` parameter, which must always be true.
`deterministic_walk` is essentially a wrapper of os.walk, and os.walk doesn't
allow modifying the order of the walk when called with `topdown` set to false.
os.walk uses os.listdir or os.scandir, depending on the Python version,
both of which don't guarantee the order in which directory entries get listed.
So when the build output depends on the order of directory traversal,
use deterministic_walk instead.
"""
# when topdown is false, modifying ``dirs`` has no effect
assert kwargs.get("topdown", True), "Determinism cannot be guaranteed when ``topdown`` is false"
for root, dirs, files in os.walk(*args, **kwargs):
dirs.sort()
files.sort()
yield root, dirs, files
# make sure ``dirs`` is sorted after any modifications
dirs.sort()


@contextlib.contextmanager
def temporary_dir(cleanup=True):
# type: (bool) -> Iterator[str]
Expand Down Expand Up @@ -675,8 +700,8 @@ def iter_files():
yield full_path, path
continue

for root, _, files in os.walk(full_path):
for f in sorted(files):
for root, _, files in deterministic_walk(full_path):
for f in files:
if exclude_file(f):
continue
abs_path = os.path.join(root, f)
Expand Down
3 changes: 2 additions & 1 deletion pex/pex_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from pex.common import (
Chroot,
chmod_plus_x,
deterministic_walk,
filter_pyc_files,
is_pyc_temporary_file,
safe_copy,
Expand Down Expand Up @@ -586,7 +587,7 @@ def _prepare_bootstrap(self):
bootstrap_packages = ["third_party", "venv"]
if self._pex_info.includes_tools:
bootstrap_packages.extend(["commands", "tools"])
for root, dirs, files in os.walk(_ABS_PEX_PACKAGE_DIR):
for root, dirs, files in deterministic_walk(_ABS_PEX_PACKAGE_DIR):
if root == _ABS_PEX_PACKAGE_DIR:
dirs[:] = bootstrap_packages

Expand Down
40 changes: 40 additions & 0 deletions testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import random
import subprocess
import sys
from collections import Counter
from contextlib import contextmanager
from textwrap import dedent

Expand All @@ -31,6 +32,11 @@
from pex.util import named_temporary_file
from pex.venv.virtualenv import Virtualenv

try:
from unittest import mock
except ImportError:
import mock # type: ignore[no-redef,import]

if TYPE_CHECKING:
from typing import (
Any,
Expand Down Expand Up @@ -759,3 +765,37 @@ def pex_project_dir():
return os.environ["_PEX_TEST_PROJECT_DIR"]
except KeyError:
sys.exit("Pex tests must be run via tox.")


class NonDeterministicWalk:
"""A wrapper around `os.walk` that makes it non-deterministic.
Makes sure that directories and files are always returned in a different
order each time it is called.
Typically used like: `unittest.mock.patch("os.walk", new=NonDeterministicWalk())`
"""

def __init__(self):
self._counter = Counter() # type: Counter[str, int]
self._original_walk = os.walk

def __call__(self, *args, **kwargs):
# type: (*Any, **Any) -> Iterator[Tuple[str, List[str], List[str]]]
for root, dirs, files in self._original_walk(*args, **kwargs):
self._increment_counter(root)
dirs[:] = self._rotate(root, dirs)
files[:] = self._rotate(root, files)
yield root, dirs, files

def _increment_counter(self, counter_key):
# type: (str) -> int
self._counter[counter_key] += 1
return self._counter[counter_key]

def _rotate(self, counter_key, x):
# type: (str, List[str]) -> List[str]
if not x:
return x
rotate_by = self._counter[counter_key] % len(x)
return x[-rotate_by:] + x[:-rotate_by]
62 changes: 62 additions & 0 deletions tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
PermPreservingZipFile,
can_write_dir,
chmod_plus_x,
deterministic_walk,
is_exe,
is_script,
open_zip,
Expand All @@ -20,6 +21,7 @@
touch,
)
from pex.typing import TYPE_CHECKING
from testing import NonDeterministicWalk

try:
from unittest import mock
Expand Down Expand Up @@ -140,6 +142,36 @@ def test_chroot_zip():
assert b"data" == zip.read("directory/subdirectory/file")


def test_chroot_zip_is_deterministic():
# type: () -> None
with temporary_dir() as tmp:
root_dir = os.path.join(tmp, "root")
dir_a = os.path.join(root_dir, "a")
src_path_a = os.path.join(dir_a, "file_a")
touch(src_path_a)
dir_b = os.path.join(root_dir, "b")
src_path_b = os.path.join(dir_b, "file_b")
touch(src_path_b)

chroot = Chroot(os.path.join(tmp, "chroot"))
chroot.symlink(root_dir, "root")

zip_one_dst = os.path.join(tmp, "chroot_one.zip")
zip_two_dst = os.path.join(tmp, "chroot_two.zip")

with mock.patch("os.walk", new=NonDeterministicWalk()):
chroot.zip(zip_one_dst)
chroot.zip(zip_two_dst)

with open_zip(zip_one_dst) as zip_file:
namelist_one = zip_file.namelist()

with open_zip(zip_two_dst) as zip_file:
namelist_two = zip_file.namelist()

assert namelist_one == namelist_two


def test_chroot_zip_symlink():
# type: () -> None
with temporary_dir() as tmp:
Expand Down Expand Up @@ -189,6 +221,36 @@ def test_chroot_zip_symlink():
assert b"data" == zip.read("symlinked/subdirectory/symlinked")


def test_deterministic_walk():
# type: () -> None
with temporary_dir() as tmp:
root_dir = os.path.join(tmp, "root")
dir_a = os.path.join(root_dir, "a")
file_a = os.path.join(dir_a, "file_a")
touch(file_a)
dir_b = os.path.join(root_dir, "b")
file_b = os.path.join(dir_b, "file_b")
touch(file_b)

with mock.patch("os.walk", new=NonDeterministicWalk()):
result_a = []
for root, dirs, files in deterministic_walk(root_dir):
result_a.append((root, dirs, files))
if dirs:
dirs[:] = ["b", "a"]

result_b = []
for root, dirs, files in deterministic_walk(root_dir):
result_b.append((root, dirs, files))

assert result_a == [
(root_dir, ["a", "b"], []),
(dir_a, [], ["file_a"]),
(dir_b, [], ["file_b"]),
], "Modifying dirs should not affect the order of the walk"
assert result_a == result_b, "Should be resilient to os.walk yielding in arbitrary order"


def test_can_write_dir_writeable_perms():
# type: () -> None
with temporary_dir() as writeable:
Expand Down
24 changes: 23 additions & 1 deletion tests/test_pex_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,14 @@
from pex.pex_builder import CopyMode, PEXBuilder
from pex.typing import TYPE_CHECKING
from pex.variables import ENV
from testing import WheelBuilder, install_wheel, make_bdist, make_env
from testing import NonDeterministicWalk, WheelBuilder, install_wheel, make_bdist, make_env
from testing import write_simple_pex as write_pex

try:
from unittest import mock
except ImportError:
import mock # type: ignore[no-redef,import]

if TYPE_CHECKING:
from typing import Any, Iterator, List, Set

Expand Down Expand Up @@ -372,6 +377,23 @@ def test_pex_builder_exclude_bootstrap_testing(
], "Expected testing support files to be stripped from the Pex `.bootstrap`."


def test_pex_builder_deterministic_pex_info(
tmpdir, # type: Any
):
# type: (...) -> None

pex_path = os.path.join(str(tmpdir), "empty.pex")

with mock.patch("os.walk", new=NonDeterministicWalk()):
pb_one = PEXBuilder()
pb_one.build(pex_path)

pb_two = PEXBuilder()
pb_two.build(pex_path)

assert pb_one.info.as_json_dict() == pb_two.info.as_json_dict()


@pytest.mark.parametrize(
"strip_pex_env", [pytest.param(True, id="StripPexEnv"), pytest.param(False, id="NoStripPexEnv")]
)
Expand Down

0 comments on commit c6fba6d

Please sign in to comment.