From 11412bdf7ddc110fcfbfcc3e87540a2fe32fe27e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20Mart=C3=ADnez?= Date: Tue, 7 May 2024 05:58:57 -0400 Subject: [PATCH] Add note to start_index parameter --- nemo_curator/utils/file_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo_curator/utils/file_utils.py b/nemo_curator/utils/file_utils.py index c0415b827..cc846c36d 100644 --- a/nemo_curator/utils/file_utils.py +++ b/nemo_curator/utils/file_utils.py @@ -222,7 +222,8 @@ def reshard_jsonl( output_dir: The output directory where the resharded jsonl files will be written output_file_size: Approximate size of output files. Must specify with a string and with the unit K, M or G for kilo, mega or gigabytes - start_index: Starting index for naming the output files + start_index: Starting index for naming the output files. Note: The indices may not + be continuous if the sharding process would output an empty file in its place file_prefix: Prefix to use to prepend to output file number """