Skip to content

Commit ee296ea

Browse files
ext.wanghao204 authored and HunterLine committed
Add support for json[l].gz, and make Ray dataset support reading the json[l].gz and json[l].zst formats
1 parent ae290f7 commit ee296ea

File tree

6 files changed

+164
-61
lines changed

6 files changed

+164
-61
lines changed

data_juicer/core/data/load_strategy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -631,7 +631,7 @@ def load_data(self, **kwargs):
631631

632632
# Use ray.data functions directly with PyArrow filesystem support
633633
# Ray's read functions support filesystem parameter via PyArrow
634-
if data_format in {"json", "jsonl"}:
634+
if data_format in {"json", "jsonl", "json.gz", "jsonl.gz", "json.zst", "jsonl.zst"}:
635635
# For JSON, we need to use read_json_stream with filesystem
636636
from data_juicer.core.data.ray_dataset import read_json_stream
637637

data_juicer/core/data/ray_dataset.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,7 @@ def count(self) -> int:
355355

356356
@classmethod
357357
def read(cls, data_format: str, paths: Union[str, List[str]]) -> RayDataset:
358-
if data_format in {"json", "jsonl"}:
358+
if data_format in {"json", "jsonl", "json.gz", "jsonl.gz", "json.zst", "jsonl.zst"}:
359359
return RayDataset.read_json(paths)
360360
elif data_format == "webdataset":
361361
return RayDataset.read_webdataset(paths)
@@ -453,7 +453,7 @@ def read_json_stream(
453453
include_paths: bool = False,
454454
ignore_missing_paths: bool = False,
455455
shuffle: Union[Literal["files"], None] = None,
456-
file_extensions: Optional[List[str]] = ["json", "jsonl"],
456+
file_extensions: Optional[List[str]] = ["json", "jsonl", "json.gz", "jsonl.gz", "json.zst", "jsonl.zst"],
457457
concurrency: Optional[int] = None,
458458
override_num_blocks: Optional[int] = None,
459459
**arrow_json_args,

data_juicer/format/json_formatter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ class JsonFormatter(LocalFormatter):
99
Default suffixes is `['.json', '.jsonl', '.jsonl.zst']`
1010
"""
1111

12-
SUFFIXES = [".json", ".jsonl", ".jsonl.zst"]
12+
SUFFIXES = [".json", ".jsonl", ".json.gz", ".jsonl.gz", ".json.zst", ".jsonl.zst"]
1313

1414
def __init__(self, dataset_path, suffixes=None, **kwargs):
1515
"""

data_juicer/utils/file_utils.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
import aiohttp
1313
import pandas as pd
14+
from datasets.utils.extract import GzipExtractor
1415
from datasets.utils.extract import ZstdExtractor as Extractor
1516

1617
from data_juicer.utils.common_utils import dict_to_hash
@@ -112,6 +113,12 @@ def find_files_with_suffix(
112113
# just like '.jsonl.zst'
113114
file_suffixes = [suffix.lower() for suffix in file.suffixes]
114115
suffix = "".join(file_suffixes[-2:])
116+
elif GzipExtractor.is_extractable(file):
117+
# support gzip-format file
118+
# and use the last 2 sub-suffixes as the final suffix
119+
# just like '.jsonl.gz'
120+
file_suffixes = [suffix.lower() for suffix in file.suffixes]
121+
suffix = "".join(file_suffixes[-2:])
115122

116123
if not suffixes or (suffix in suffixes):
117124
if suffix not in file_dict:
Lines changed: 78 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,49 +1,117 @@
11
import os
22
import unittest
3+
import gzip
4+
import tempfile
5+
import shutil
36

47
from data_juicer.format.json_formatter import JsonFormatter
58
from data_juicer.format.load import load_formatter
69
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
710

11+
try:
12+
import zstandard as zstd # type: ignore
13+
14+
HAS_ZSTD = True
15+
except Exception:
16+
zstd = None
17+
HAS_ZSTD = False
18+
819

920
class JsonFormatterTest(DataJuicerTestCaseBase):
1021

1122
def setUp(self):
1223
super().setUp()
1324

14-
self._path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
15-
'data', 'structured')
16-
self._file = os.path.join(self._path, 'demo-dataset.jsonl')
25+
self._path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data", "structured")
26+
self._file = os.path.join(self._path, "demo-dataset.jsonl")
1727
print(self._file)
28+
# create compressed variants for testing
29+
# create a temp directory to hold generated compressed files
30+
self._temp_dir = tempfile.mkdtemp()
31+
with open(self._file, "rb") as f:
32+
raw = f.read()
33+
34+
# .jsonl.gz
35+
self._jsonl_gz = os.path.join(self._temp_dir, "demo-dataset.jsonl.gz")
36+
with gzip.open(self._jsonl_gz, "wb") as f:
37+
f.write(raw)
38+
39+
# .json.gz (same content, different suffix)
40+
self._json_gz = os.path.join(self._temp_dir, "demo-dataset.json.gz")
41+
with gzip.open(self._json_gz, "wb") as f:
42+
f.write(raw)
43+
44+
# .json.zst and .jsonl.zst if zstandard available
45+
if HAS_ZSTD:
46+
self._jsonl_zst = os.path.join(self._temp_dir, "demo-dataset.jsonl.zst")
47+
self._json_zst = os.path.join(self._temp_dir, "demo-dataset.json.zst")
48+
cctx = zstd.ZstdCompressor()
49+
compressed = cctx.compress(raw)
50+
with open(self._jsonl_zst, "wb") as f:
51+
f.write(compressed)
52+
with open(self._json_zst, "wb") as f:
53+
f.write(compressed)
1854

1955
def test_json_file(self):
2056
formatter = JsonFormatter(self._file)
2157
ds = formatter.load_dataset()
2258
self.assertEqual(len(ds), 6)
23-
self.assertEqual(list(ds.features.keys()), ['text', 'meta'])
59+
self.assertEqual(list(ds.features.keys()), ["text", "meta"])
2460

2561
def test_json_path(self):
2662
formatter = JsonFormatter(self._path)
2763
ds = formatter.load_dataset()
2864
self.assertEqual(len(ds), 6)
29-
self.assertEqual(list(ds.features.keys()), ['text', 'meta'])
65+
self.assertEqual(list(ds.features.keys()), ["text", "meta"])
3066

3167
def test_load_formatter_with_file(self):
3268
"""Test load_formatter with a direct file path"""
3369
formatter = load_formatter(self._file)
3470
self.assertIsInstance(formatter, JsonFormatter)
3571
ds = formatter.load_dataset()
3672
self.assertEqual(len(ds), 6)
37-
self.assertEqual(list(ds.features.keys()), ['text', 'meta'])
73+
self.assertEqual(list(ds.features.keys()), ["text", "meta"])
3874

3975
def test_load_formatter_with_specified_suffix(self):
4076
"""Test load_formatter with specified suffixes"""
41-
formatter = load_formatter(self._path, suffixes=['.jsonl'])
77+
formatter = load_formatter(self._path, suffixes=[".jsonl"])
4278
self.assertIsInstance(formatter, JsonFormatter)
4379
ds = formatter.load_dataset()
4480
self.assertEqual(len(ds), 6)
45-
self.assertEqual(list(ds.features.keys()), ['text', 'meta'])
81+
self.assertEqual(list(ds.features.keys()), ["text", "meta"])
82+
83+
def tearDown(self):
84+
# cleanup temp dir and files
85+
if hasattr(self, "_temp_dir") and os.path.exists(self._temp_dir):
86+
shutil.rmtree(self._temp_dir)
87+
super().tearDown()
88+
89+
def test_jsonl_gz_file(self):
90+
formatter = JsonFormatter(self._jsonl_gz)
91+
ds = formatter.load_dataset()
92+
self.assertEqual(len(ds), 6)
93+
self.assertEqual(list(ds.features.keys()), ["text", "meta"])
94+
95+
def test_json_gz_file(self):
96+
formatter = JsonFormatter(self._json_gz)
97+
ds = formatter.load_dataset()
98+
self.assertEqual(len(ds), 6)
99+
self.assertEqual(list(ds.features.keys()), ["text", "meta"])
100+
101+
@unittest.skipUnless(HAS_ZSTD, "zstandard not installed")
102+
def test_json_zst_file(self):
103+
formatter = JsonFormatter(self._json_zst)
104+
ds = formatter.load_dataset()
105+
self.assertEqual(len(ds), 6)
106+
self.assertEqual(list(ds.features.keys()), ["text", "meta"])
107+
108+
@unittest.skipUnless(HAS_ZSTD, "zstandard not installed")
109+
def test_jsonl_zst_file(self):
110+
formatter = JsonFormatter(self._jsonl_zst)
111+
ds = formatter.load_dataset()
112+
self.assertEqual(len(ds), 6)
113+
self.assertEqual(list(ds.features.keys()), ["text", "meta"])
46114

47115

48-
if __name__ == '__main__':
49-
unittest.main()
116+
if __name__ == "__main__":
117+
unittest.main()

tests/utils/test_file_utils.py

Lines changed: 75 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,55 +1,56 @@
11
import os
22
import unittest
33
import regex as re
4+
import gzip
45

56
from data_juicer.utils.file_utils import (
6-
find_files_with_suffix, is_absolute_path,
7-
add_suffix_to_filename, create_directory_if_not_exists, transfer_filename,
8-
copy_data
7+
find_files_with_suffix,
8+
is_absolute_path,
9+
add_suffix_to_filename,
10+
create_directory_if_not_exists,
11+
transfer_filename,
12+
copy_data,
913
)
1014
from data_juicer.utils.mm_utils import Fields
1115

1216
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
1317

18+
1419
class FileUtilsTest(DataJuicerTestCaseBase):
1520

1621
def setUp(self) -> None:
1722
super().setUp()
18-
self.temp_output_path = 'tmp/test_file_utils/'
23+
self.temp_output_path = "tmp/test_file_utils/"
1924
os.makedirs(self.temp_output_path)
2025

2126
def tearDown(self):
2227
if os.path.exists(self.temp_output_path):
23-
os.system(f'rm -rf {self.temp_output_path}')
28+
os.system(f"rm -rf {self.temp_output_path}")
2429
super().tearDown()
2530

2631
def test_find_files_with_suffix(self):
2732
# prepare test files
28-
fn_list = ['test1.txt', 'test2.txt', 'test3.md']
33+
fn_list = ["test1.txt", "test2.txt", "test3.md"]
2934
for fn in fn_list:
30-
with open(os.path.join(self.temp_output_path, fn), 'w') as f:
35+
with open(os.path.join(self.temp_output_path, fn), "w") as f:
3136
f.write(fn)
3237

33-
self.assertEqual(find_files_with_suffix(os.path.join(self.temp_output_path, 'test1.txt')),
34-
{'.txt': [os.path.join(self.temp_output_path, 'test1.txt')]})
38+
self.assertEqual(
39+
find_files_with_suffix(os.path.join(self.temp_output_path, "test1.txt")),
40+
{".txt": [os.path.join(self.temp_output_path, "test1.txt")]},
41+
)
3542
result = find_files_with_suffix(self.temp_output_path)
3643
expected = {
37-
'.txt': sorted([
38-
os.path.join(self.temp_output_path, 'test1.txt'),
39-
os.path.join(self.temp_output_path, 'test2.txt')
40-
]),
41-
'.md': [os.path.join(self.temp_output_path, 'test3.md')]
44+
".txt": sorted([os.path.join(self.temp_output_path, "test1.txt"), os.path.join(self.temp_output_path, "test2.txt")]),
45+
".md": [os.path.join(self.temp_output_path, "test3.md")],
4246
}
4347
for suffix in result:
4448
result[suffix] = sorted(result[suffix])
4549
self.assertEqual(result, expected)
4650

47-
result_txt = find_files_with_suffix(self.temp_output_path, 'txt')
51+
result_txt = find_files_with_suffix(self.temp_output_path, "txt")
4852
expected_txt = {
49-
'.txt': sorted([
50-
os.path.join(self.temp_output_path, 'test1.txt'),
51-
os.path.join(self.temp_output_path, 'test2.txt')
52-
])
53+
".txt": sorted([os.path.join(self.temp_output_path, "test1.txt"), os.path.join(self.temp_output_path, "test2.txt")])
5354
}
5455
for suffix in result_txt:
5556
result_txt[suffix] = sorted(result_txt[suffix])
@@ -60,10 +61,10 @@ def test_is_absolute_path(self):
6061
self.assertTrue(is_absolute_path(os.path.abspath(self.temp_output_path)))
6162

6263
def test_add_suffix_to_filename(self):
63-
self.assertEqual(add_suffix_to_filename('test.txt', '_suffix'), 'test_suffix.txt')
64-
self.assertEqual(add_suffix_to_filename('test.txt', ''), 'test.txt')
65-
self.assertEqual(add_suffix_to_filename('test', '_suffix'), 'test_suffix')
66-
self.assertEqual(add_suffix_to_filename('.git', '_suffix'), '.git_suffix')
64+
self.assertEqual(add_suffix_to_filename("test.txt", "_suffix"), "test_suffix.txt")
65+
self.assertEqual(add_suffix_to_filename("test.txt", ""), "test.txt")
66+
self.assertEqual(add_suffix_to_filename("test", "_suffix"), "test_suffix")
67+
self.assertEqual(add_suffix_to_filename(".git", "_suffix"), ".git_suffix")
6768

6869
def test_create_directory_if_not_exists(self):
6970
self.assertTrue(os.path.exists(self.temp_output_path))
@@ -76,55 +77,82 @@ def test_create_directory_if_not_exists(self):
7677

7778
def test_transfer_filename(self):
7879
# test existing file
79-
with open(os.path.join(self.temp_output_path, 'abc.jpg'), 'w') as f:
80-
f.write('test')
80+
with open(os.path.join(self.temp_output_path, "abc.jpg"), "w") as f:
81+
f.write("test")
8182
self.assertTrue(
8283
re.match(
83-
os.path.join(self.temp_output_path, Fields.multimodal_data_output_dir, 'op1', 'abc__dj_hash_#(.*?)#.jpg'),
84-
transfer_filename(os.path.join(self.temp_output_path, 'abc.jpg'), 'op1')))
84+
os.path.join(self.temp_output_path, Fields.multimodal_data_output_dir, "op1", "abc__dj_hash_#(.*?)#.jpg"),
85+
transfer_filename(os.path.join(self.temp_output_path, "abc.jpg"), "op1"),
86+
)
87+
)
8588
# test non-existing file
8689
self.assertTrue(
8790
re.match(
88-
os.path.join(self.temp_output_path, 'non-existing.jpg'),
89-
transfer_filename(os.path.join(self.temp_output_path, 'non-existing.jpg'), 'op1')))
91+
os.path.join(self.temp_output_path, "non-existing.jpg"),
92+
transfer_filename(os.path.join(self.temp_output_path, "non-existing.jpg"), "op1"),
93+
)
94+
)
9095
# test save_dir
9196
self.temp_output_path = os.path.abspath(self.temp_output_path)
9297
self.assertTrue(
9398
re.match(
94-
os.path.join(self.temp_output_path, 'tmp_save_dir', 'abc__dj_hash_#(.*?)#.jpg'),
95-
transfer_filename(os.path.join(self.temp_output_path, 'abc.jpg'), 'op1',
96-
save_dir=os.path.join(self.temp_output_path, 'tmp_save_dir'))))
99+
os.path.join(self.temp_output_path, "tmp_save_dir", "abc__dj_hash_#(.*?)#.jpg"),
100+
transfer_filename(
101+
os.path.join(self.temp_output_path, "abc.jpg"),
102+
"op1",
103+
save_dir=os.path.join(self.temp_output_path, "tmp_save_dir"),
104+
),
105+
)
106+
)
97107
# test env dir
98108
try:
99-
ori_env_dir = os.environ.get('DJ_PRODUCED_DATA_DIR', None)
100-
test_env_dir = os.path.join(self.temp_output_path, 'tmp_env_dir')
101-
os.environ['DJ_PRODUCED_DATA_DIR'] = test_env_dir
109+
ori_env_dir = os.environ.get("DJ_PRODUCED_DATA_DIR", None)
110+
test_env_dir = os.path.join(self.temp_output_path, "tmp_env_dir")
111+
os.environ["DJ_PRODUCED_DATA_DIR"] = test_env_dir
102112

103-
transfer_filename(os.path.join(self.temp_output_path, 'abc.jpg'), 'op1')
113+
transfer_filename(os.path.join(self.temp_output_path, "abc.jpg"), "op1")
104114
self.assertTrue(
105115
re.match(
106-
os.path.join(test_env_dir, 'op1', 'abc__dj_hash_#(.*?)#.jpg'),
107-
transfer_filename(os.path.join(self.temp_output_path, 'abc.jpg'), 'op1')))
116+
os.path.join(test_env_dir, "op1", "abc__dj_hash_#(.*?)#.jpg"),
117+
transfer_filename(os.path.join(self.temp_output_path, "abc.jpg"), "op1"),
118+
)
119+
)
108120
finally:
109121
if ori_env_dir:
110-
os.environ['DJ_PRODUCED_DATA_DIR'] = ori_env_dir
111-
elif 'DJ_PRODUCED_DATA_DIR' in os.environ:
112-
del os.environ['DJ_PRODUCED_DATA_DIR']
122+
os.environ["DJ_PRODUCED_DATA_DIR"] = ori_env_dir
123+
elif "DJ_PRODUCED_DATA_DIR" in os.environ:
124+
del os.environ["DJ_PRODUCED_DATA_DIR"]
113125

114126
def test_copy_data(self):
115-
tgt_fn = 'test.txt'
116-
ori_dir = os.path.join(self.temp_output_path, 'test1')
117-
tgt_dir = os.path.join(self.temp_output_path, 'test2')
127+
tgt_fn = "test.txt"
128+
ori_dir = os.path.join(self.temp_output_path, "test1")
129+
tgt_dir = os.path.join(self.temp_output_path, "test2")
118130

119131
self.assertFalse(copy_data(ori_dir, tgt_dir, tgt_fn))
120132

121133
os.makedirs(ori_dir, exist_ok=True)
122-
with open(os.path.join(ori_dir, tgt_fn), 'w') as f:
123-
f.write('test')
134+
with open(os.path.join(ori_dir, tgt_fn), "w") as f:
135+
f.write("test")
124136

125137
self.assertTrue(copy_data(ori_dir, tgt_dir, tgt_fn))
126138
self.assertTrue(os.path.exists(os.path.join(tgt_dir, tgt_fn)))
127139

140+
def test_find_files_with_suffix_gzip(self):
141+
# create a gzip compressed jsonl file and ensure it is detected as '.jsonl.gz'
142+
content = '{"text": "gzip test"}\n'
143+
gz_path = os.path.join(self.temp_output_path, "demo-dataset.jsonl.gz")
144+
with gzip.open(gz_path, "wb") as f:
145+
f.write(content.encode("utf-8"))
146+
147+
result = find_files_with_suffix(self.temp_output_path)
148+
149+
# normalize lists for comparison
150+
for suffix in result:
151+
result[suffix] = sorted(result[suffix])
152+
153+
self.assertIn(".jsonl.gz", result)
154+
self.assertEqual(result[".jsonl.gz"], [gz_path])
155+
128156

129-
if __name__ == '__main__':
157+
if __name__ == "__main__":
130158
unittest.main()

0 commit comments

Comments
 (0)