Add Docstring based on Sarah's review
Signed-off-by: Vibhu Jawa <[email protected]>
VibhuJawa committed Feb 5, 2025
1 parent 2272d3d commit d160473
Showing 2 changed files with 41 additions and 2 deletions.
40 changes: 38 additions & 2 deletions nemo_curator/datasets/doc_dataset.py
@@ -163,8 +163,26 @@ def to_json(
        partition_on: Optional[str] = None,
    ):
        """
-        See nemo_curator.utils.distributed_utils.write_to_disk docstring for parameters.
+        Writes the dataset to the specified path in JSONL format.
+        If `write_to_filename` is True, the DataFrame is expected to have a column
+        that specifies the filename for each document. This column is named
+        `file_name` by default, or a custom name if `write_to_filename` is a string.
+
+        Args:
+            output_path (str): The directory or file path where the dataset will be written.
+            write_to_filename (Union[bool, str]): Determines how filenames are handled.
+                - If True, uses the `file_name` column in the DataFrame to determine filenames.
+                - If a string, uses that string as the column name for filenames.
+                - If False, writes all data to the specified `output_path`.
+            keep_filename_column (bool): If True, retains the filename column in the output.
+                If False, the filename column is dropped from the output.
+            partition_on (Optional[str]): The column name used to partition the data.
+                If specified, data is partitioned based on unique values in this column,
+                with each partition written to a separate directory.
+
+        For more details, refer to the `write_to_disk` function in
+        `nemo_curator.utils.distributed_utils`.
        """
        write_to_disk(
            df=self.df,
@@ -183,8 +201,26 @@ def to_parquet(
        partition_on: Optional[str] = None,
    ):
        """
-        See nemo_curator.utils.distributed_utils.write_to_disk docstring for parameters.
+        Writes the dataset to the specified path in Parquet format.
+        If `write_to_filename` is True, the DataFrame is expected to have a column
+        that specifies the filename for each document. This column is named
+        `file_name` by default, or a custom name if `write_to_filename` is a string.
+
+        Args:
+            output_path (str): The directory or file path where the dataset will be written.
+            write_to_filename (Union[bool, str]): Determines how filenames are handled.
+                - If True, uses the `file_name` column in the DataFrame to determine filenames.
+                - If a string, uses that string as the column name for filenames.
+                - If False, writes all data to the specified `output_path`.
+            keep_filename_column (bool): If True, retains the filename column in the output.
+                If False, the filename column is dropped from the output.
+            partition_on (Optional[str]): The column name used to partition the data.
+                If specified, data is partitioned based on unique values in this column,
+                with each partition written to a separate directory.
+
+        For more details, refer to the `write_to_disk` function in
+        `nemo_curator.utils.distributed_utils`.
        """
        write_to_disk(
            df=self.df,
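Taken together, the new docstrings describe a small but complete write API. A minimal usage sketch follows (not part of the commit); the input directory, the add_filename flag on read_json, and the "language" column used for partitioning are illustrative assumptions rather than details taken from this diff.

```python
from nemo_curator.datasets import DocumentDataset

# Assumed input: a directory of JSONL files. add_filename=True is assumed to
# attach a "file_name" column recording each document's source file, which is
# what write_to_filename=True relies on per the docstring above.
dataset = DocumentDataset.read_json("input_dir/", add_filename=True)

# Write one JSONL file per original input file and keep the "file_name" column.
dataset.to_json(
    output_path="output_jsonl/",
    write_to_filename=True,
    keep_filename_column=True,
)

# Or partition the output by a (hypothetical) "language" column instead;
# each unique value is written to its own subdirectory.
dataset.to_parquet(
    output_path="output_parquet/",
    partition_on="language",
)
```
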
3 changes: 3 additions & 0 deletions nemo_curator/utils/distributed_utils.py
@@ -858,6 +858,9 @@ def write_to_disk(
            If str, uses that as the filename column to write to.
        keep_filename_column: Boolean representing whether to keep or drop the filename column, if it exists.
        output_type: The type of output file to write. Can be "jsonl" or "parquet".
+        partition_on: The column name to partition the data on.
+            If specified, the data will be partitioned based on the unique values in this column,
+            and each partition will be written to a separate directory.
    """

    filename_col = _resolve_filename_col(write_to_filename)
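The partition_on behavior documented above can also be exercised through write_to_disk directly. Below is a minimal sketch, assuming a toy Dask DataFrame and the usual column=value directory convention (as in dask.dataframe.to_parquet); the column names, values, and output path are illustrative, not taken from the commit.

```python
import pandas as pd
import dask.dataframe as dd
from nemo_curator.utils.distributed_utils import write_to_disk

# Toy Dask DataFrame with a "language" column to partition on (illustrative data).
pdf = pd.DataFrame(
    {"text": ["hello", "hallo", "bonjour"], "language": ["en", "de", "fr"]}
)
df = dd.from_pandas(pdf, npartitions=1)

write_to_disk(
    df=df,
    output_path="partitioned_output/",
    output_type="parquet",
    partition_on="language",
)

# Expected layout, assuming a column=value partitioning scheme:
# partitioned_output/
# ├── language=de/
# ├── language=en/
# └── language=fr/
```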
