Skip to content

Commit 124b479

Browse files
authored
Make gzip reproducible (#560)
1 parent 0e61a20 commit 124b479

File tree

2 files changed

+20
-8
lines changed

2 files changed

+20
-8
lines changed

ord_schema/message_helpers.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
"""Helper functions for constructing Protocol Buffer messages."""
1515

1616
import enum
17+
import functools
1718
import gzip
1819
import os
1920
import re
@@ -770,21 +771,19 @@ def write_message(message: ord_schema.Message, filename: str):
770771
ValueError: if `filename` does not have the expected suffix.
771772
"""
772773
if filename.endswith('.gz'):
773-
this_open = gzip.open
774+
# NOTE(kearnes): Set a constant mtime so that round-trips through gzip
775+
# result in identical files.
776+
this_open = functools.partial(gzip.GzipFile, mtime=1)
774777
_, extension = os.path.splitext('.'.join(filename.split('.')[:-1]))
775778
else:
776779
this_open = open
777780
_, extension = os.path.splitext(filename)
778781
output_format = MessageFormat(extension)
779-
if output_format == MessageFormat.BINARY:
780-
mode = 'wb'
781-
else:
782-
mode = 'wt'
783-
with this_open(filename, mode) as f:
782+
with this_open(filename, 'wb') as f:
784783
if output_format == MessageFormat.JSON:
785-
f.write(json_format.MessageToJson(message))
784+
f.write(json_format.MessageToJson(message).encode())
786785
elif output_format == MessageFormat.PBTXT:
787-
f.write(text_format.MessageToString(message))
786+
f.write(text_format.MessageToBytes(message))
788787
elif output_format == MessageFormat.BINARY:
789788
f.write(message.SerializeToString(deterministic=True))
790789

ord_schema/message_helpers_test.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
import os
1717
import tempfile
18+
import time
1819

1920
from absl import flags
2021
from absl.testing import absltest
@@ -429,6 +430,7 @@ class LoadAndWriteMessageTest(parameterized.TestCase, absltest.TestCase):
429430

430431
def setUp(self):
431432
super().setUp()
433+
self.test_directory = self.create_tempdir()
432434
self.messages = [
433435
test_pb2.Scalar(int32_value=3, float_value=4.5),
434436
test_pb2.RepeatedScalar(values=[1.2, 3.4]),
@@ -448,6 +450,17 @@ def test_round_trip(self, suffix):
448450
message,
449451
message_helpers.load_message(f.name, type(message)))
450452

453+
def test_gzip_reproducibility(self):
    """Checks that writing the same message twice gives identical gzip bytes.

    Each message in `self.messages` is written to the same `.pb.gz` path
    twice, with a pause in between, and the raw file contents from the two
    writes are compared byte-for-byte.
    """

    def _write_then_read(msg, path):
        # Serialize `msg` to `path` and return the raw bytes on disk.
        message_helpers.write_message(msg, path)
        with open(path, 'rb') as handle:
            return handle.read()

    target = os.path.join(self.test_directory, 'test.pb.gz')
    for msg in self.messages:
        first_bytes = _write_then_read(msg, target)
        # Let the wall clock advance between the two writes; presumably
        # this is what would expose a time-dependent gzip header.
        time.sleep(1)
        second_bytes = _write_then_read(msg, target)
        self.assertEqual(second_bytes, first_bytes)
463+
451464
def test_bad_binary(self):
452465
with tempfile.NamedTemporaryFile(suffix='.pb') as f:
453466
message = test_pb2.RepeatedScalar(values=[1.2, 3.4])

0 commit comments

Comments
 (0)