11import os
22import unittest
33import regex as re
4+ import gzip
45
56from data_juicer .utils .file_utils import (
6- find_files_with_suffix , is_absolute_path ,
7- add_suffix_to_filename , create_directory_if_not_exists , transfer_filename ,
8- copy_data
7+ find_files_with_suffix ,
8+ is_absolute_path ,
9+ add_suffix_to_filename ,
10+ create_directory_if_not_exists ,
11+ transfer_filename ,
12+ copy_data ,
913)
1014from data_juicer .utils .mm_utils import Fields
1115
1216from data_juicer .utils .unittest_utils import DataJuicerTestCaseBase
1317
18+
1419class FileUtilsTest (DataJuicerTestCaseBase ):
1520
def setUp(self) -> None:
    """Create an empty scratch directory for the test about to run.

    The directory path is stored on ``self.temp_output_path`` so that the
    individual tests and ``tearDown`` can refer to it.
    """
    import shutil

    super().setUp()
    self.temp_output_path = "tmp/test_file_utils/"
    # A run that was interrupted before tearDown leaves the directory (and
    # possibly stale files) behind; wipe it so makedirs cannot fail and the
    # tests always start from an empty directory.
    shutil.rmtree(self.temp_output_path, ignore_errors=True)
    os.makedirs(self.temp_output_path)
2025
def tearDown(self):
    """Delete the scratch directory created in ``setUp``.

    Uses ``shutil.rmtree`` instead of shelling out to ``rm -rf`` so that the
    cleanup works on platforms without ``rm`` and no path is interpolated
    into a shell command line.
    """
    import shutil

    # ignore_errors=True also covers the "directory is already gone" case,
    # so no separate existence check is needed.
    shutil.rmtree(self.temp_output_path, ignore_errors=True)
    super().tearDown()
2530
2631 def test_find_files_with_suffix (self ):
2732 # prepare test files
28- fn_list = [' test1.txt' , ' test2.txt' , ' test3.md' ]
33+ fn_list = [" test1.txt" , " test2.txt" , " test3.md" ]
2934 for fn in fn_list :
30- with open (os .path .join (self .temp_output_path , fn ), 'w' ) as f :
35+ with open (os .path .join (self .temp_output_path , fn ), "w" ) as f :
3136 f .write (fn )
3237
33- self .assertEqual (find_files_with_suffix (os .path .join (self .temp_output_path , 'test1.txt' )),
34- {'.txt' : [os .path .join (self .temp_output_path , 'test1.txt' )]})
38+ self .assertEqual (
39+ find_files_with_suffix (os .path .join (self .temp_output_path , "test1.txt" )),
40+ {".txt" : [os .path .join (self .temp_output_path , "test1.txt" )]},
41+ )
3542 result = find_files_with_suffix (self .temp_output_path )
3643 expected = {
37- '.txt' : sorted ([
38- os .path .join (self .temp_output_path , 'test1.txt' ),
39- os .path .join (self .temp_output_path , 'test2.txt' )
40- ]),
41- '.md' : [os .path .join (self .temp_output_path , 'test3.md' )]
44+ ".txt" : sorted ([os .path .join (self .temp_output_path , "test1.txt" ), os .path .join (self .temp_output_path , "test2.txt" )]),
45+ ".md" : [os .path .join (self .temp_output_path , "test3.md" )],
4246 }
4347 for suffix in result :
4448 result [suffix ] = sorted (result [suffix ])
4549 self .assertEqual (result , expected )
4650
47- result_txt = find_files_with_suffix (self .temp_output_path , ' txt' )
51+ result_txt = find_files_with_suffix (self .temp_output_path , " txt" )
4852 expected_txt = {
49- '.txt' : sorted ([
50- os .path .join (self .temp_output_path , 'test1.txt' ),
51- os .path .join (self .temp_output_path , 'test2.txt' )
52- ])
53+ ".txt" : sorted ([os .path .join (self .temp_output_path , "test1.txt" ), os .path .join (self .temp_output_path , "test2.txt" )])
5354 }
5455 for suffix in result_txt :
5556 result_txt [suffix ] = sorted (result_txt [suffix ])
@@ -60,10 +61,10 @@ def test_is_absolute_path(self):
6061 self .assertTrue (is_absolute_path (os .path .abspath (self .temp_output_path )))
6162
def test_add_suffix_to_filename(self):
    """Check that the suffix is inserted before the file extension.

    Each row is ``(filename, suffix, expected)``. The cases cover a regular
    extension, an empty suffix, a name without extension and a dot-file.
    """
    cases = [
        ("test.txt", "_suffix", "test_suffix.txt"),
        ("test.txt", "", "test.txt"),
        ("test", "_suffix", "test_suffix"),
        (".git", "_suffix", ".git_suffix"),
    ]
    for filename, suffix, expected in cases:
        # subTest reports every failing row instead of stopping at the first.
        with self.subTest(filename=filename, suffix=suffix):
            self.assertEqual(add_suffix_to_filename(filename, suffix), expected)
6768
6869 def test_create_directory_if_not_exists (self ):
6970 self .assertTrue (os .path .exists (self .temp_output_path ))
@@ -76,55 +77,82 @@ def test_create_directory_if_not_exists(self):
7677
def test_transfer_filename(self):
    """Check the output path produced by ``transfer_filename``.

    Four situations are exercised:

    * an existing source file, expected under
      ``<source dir>/<Fields.multimodal_data_output_dir>/op1/``;
    * a source file that does not exist, whose result is expected to start
      with the input path;
    * an explicit ``save_dir`` argument;
    * the ``DJ_PRODUCED_DATA_DIR`` environment variable.
    """
    from unittest import mock

    def hashed_name_pattern(*dir_parts):
        """Return a regex for ``<dir_parts>/abc__dj_hash_#<anything>#.jpg``."""
        # Paths are escaped so that characters such as "." or "+" in the
        # directory names are matched literally rather than as regex syntax.
        prefix = os.path.join(*dir_parts, "abc__dj_hash_#")
        return re.escape(prefix) + "(.*?)" + re.escape("#.jpg")

    # test existing file
    src_path = os.path.join(self.temp_output_path, "abc.jpg")
    with open(src_path, "w") as f:
        f.write("test")
    self.assertIsNotNone(
        re.match(
            hashed_name_pattern(self.temp_output_path, Fields.multimodal_data_output_dir, "op1"),
            transfer_filename(src_path, "op1"),
        )
    )

    # test non-existing file
    missing_path = os.path.join(self.temp_output_path, "non-existing.jpg")
    self.assertIsNotNone(
        re.match(
            re.escape(missing_path),
            transfer_filename(missing_path, "op1"),
        )
    )

    # test save_dir
    # Absolute paths are kept in locals so that self.temp_output_path, which
    # tearDown relies on, is not rebound by this test.
    abs_output_path = os.path.abspath(self.temp_output_path)
    abs_src_path = os.path.join(abs_output_path, "abc.jpg")
    save_dir = os.path.join(abs_output_path, "tmp_save_dir")
    self.assertIsNotNone(
        re.match(
            hashed_name_pattern(save_dir),
            transfer_filename(abs_src_path, "op1", save_dir=save_dir),
        )
    )

    # test env dir
    test_env_dir = os.path.join(abs_output_path, "tmp_env_dir")
    # patch.dict restores os.environ on exit, including the cases where the
    # variable was previously unset or set to an empty string.
    with mock.patch.dict(os.environ, {"DJ_PRODUCED_DATA_DIR": test_env_dir}):
        self.assertIsNotNone(
            re.match(
                hashed_name_pattern(test_env_dir, "op1"),
                transfer_filename(abs_src_path, "op1"),
            )
        )
113125
def test_copy_data(self):
    """Check ``copy_data`` for a missing and for an existing source file.

    The call must return a falsy value while the source file does not
    exist, and a truthy value once it does; after the successful call the
    file must be present in the target directory with the same content.
    """
    tgt_fn = "test.txt"
    ori_dir = os.path.join(self.temp_output_path, "test1")
    tgt_dir = os.path.join(self.temp_output_path, "test2")
    tgt_path = os.path.join(tgt_dir, tgt_fn)

    # Nothing has been created yet, so there is nothing to copy.
    self.assertFalse(copy_data(ori_dir, tgt_dir, tgt_fn))

    os.makedirs(ori_dir, exist_ok=True)
    with open(os.path.join(ori_dir, tgt_fn), "w") as f:
        f.write("test")

    self.assertTrue(copy_data(ori_dir, tgt_dir, tgt_fn))
    self.assertTrue(os.path.exists(tgt_path))
    # Existence alone would also be satisfied by an empty file; make sure
    # the data itself arrived.
    with open(tgt_path) as f:
        self.assertEqual(f.read(), "test")
127139
def test_find_files_with_suffix_gzip(self):
    """A gzip-compressed jsonl file must be grouped under '.jsonl.gz'."""
    archive_path = os.path.join(self.temp_output_path, "demo-dataset.jsonl.gz")
    payload = '{"text": "gzip test"}\n '.encode("utf-8")
    with gzip.open(archive_path, "wb") as gz_file:
        gz_file.write(payload)

    # Sort each path list so the comparison does not depend on scan order.
    found = {
        suffix: sorted(paths)
        for suffix, paths in find_files_with_suffix(self.temp_output_path).items()
    }

    self.assertIn(".jsonl.gz", found)
    self.assertEqual(found[".jsonl.gz"], [archive_path])
155+
128156
# Run the test suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()
0 commit comments