diff --git a/docs/user-guide/distributeddataclassification.rst b/docs/user-guide/distributeddataclassification.rst
index 389e8ef1b..9b193098b 100644
--- a/docs/user-guide/distributeddataclassification.rst
+++ b/docs/user-guide/distributeddataclassification.rst
@@ -61,7 +61,7 @@ Let's see how ``DomainClassifier`` works in a small excerpt taken from ``example
 
     from nemo_curator.classifiers import DomainClassifier
 
-    files = get_all_files_paths_under("books_dataset/")
+    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")
 
     domain_classifier = DomainClassifier(filter_by=["Games", "Sports"])
@@ -83,7 +83,7 @@ Using the ``MultilingualDomainClassifier`` is very similar to using the ``Domain
 
     from nemo_curator.classifiers import MultilingualDomainClassifier
 
-    files = get_all_files_paths_under("japanese_books_dataset/")
+    files = get_all_files_paths_under("japanese_books_dataset/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")
 
     multilingual_domain_classifier = MultilingualDomainClassifier(
@@ -106,7 +106,7 @@ Here's an example of how to use the ``QualityClassifier``:
 
     from nemo_curator.classifiers import QualityClassifier
 
-    files = get_all_files_paths_under("web_documents/")
+    files = get_all_files_paths_under("web_documents/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")
 
     quality_classifier = QualityClassifier(filter_by=["High", "Medium"])
@@ -134,7 +134,7 @@ NeMo Curator provides an easy way to annotate and filter your data using the saf
 
 .. code-block:: python
 
-    files = get_all_files_paths_under("unsafe_documents/")
+    files = get_all_files_paths_under("unsafe_documents/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")
 
     token = "hf_1234"  # Replace with your user access token
@@ -181,7 +181,7 @@ Here is a small example of how to use the ``InstructionDataGuardClassifier``:
 
     # The model expects instruction-response style text data. For example:
     # "Instruction: {instruction}. Input: {input_}. Response: {response}."
-    files = get_all_files_paths_under("instruction_input_response_dataset/")
+    files = get_all_files_paths_under("instruction_input_response_dataset/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")
 
     token = "hf_1234"  # Replace with your user access token
@@ -210,7 +210,7 @@ To use the FineWeb Educational Content Classifier, you can follow this example:
 
     from nemo_curator.classifiers import FineWebEduClassifier
 
-    files = get_all_files_paths_under("web_documents/")
+    files = get_all_files_paths_under("web_documents/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")
 
     edu_classifier = FineWebEduClassifier(
@@ -247,7 +247,7 @@ Let's see how ``ContentTypeClassifier`` works in a small excerpt taken from ``ex
 
     from nemo_curator.classifiers import ContentTypeClassifier
 
-    files = get_all_files_paths_under("books_dataset/")
+    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")
 
     content_type_classifier = ContentTypeClassifier(filter_by=["Blogs", "News"])
@@ -269,7 +269,7 @@ Here's an example of how to use the ``PromptTaskComplexityClassifier``:
 
     from nemo_curator.classifiers import PromptTaskComplexityClassifier
 
-    files = get_all_files_paths_under("my_dataset/")
+    files = get_all_files_paths_under("my_dataset/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")
 
     classifier = PromptTaskComplexityClassifier()
diff --git a/docs/user-guide/documentdataset.rst b/docs/user-guide/documentdataset.rst
index 445a09333..1cce1bb04 100644
--- a/docs/user-guide/documentdataset.rst
+++ b/docs/user-guide/documentdataset.rst
@@ -43,7 +43,7 @@ You could read, filter the dataset, and write it using the following methods
     from nemo_curator.utils.file_utils import get_all_files_paths_under
     from nemo_curator.filters import WordCountFilter
 
-    files = get_all_files_paths_under("books_dataset/")
+    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")
     books = DocumentDataset.read_json(files, add_filename=True)
 
     filter_step = nc.ScoreFilter(
@@ -58,7 +58,7 @@ You could read, filter the dataset, and write it using the following methods
 
 Let's walk through this code line by line.
 
-* ``files = get_all_files_paths_under("books_dataset/")`` This retrieves a list of all files in the given directory.
+* ``files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")`` This retrieves a list of all files in the given directory, then filters the list to include only files ending with ".jsonl".
   In our case, this is equivalent to writing
 
   .. code-block:: python
diff --git a/docs/user-guide/qualityfiltering.rst b/docs/user-guide/qualityfiltering.rst
index ba2c34ad6..5060fb1ec 100644
--- a/docs/user-guide/qualityfiltering.rst
+++ b/docs/user-guide/qualityfiltering.rst
@@ -35,7 +35,7 @@ Let's examine this small example:
     from nemo_curator.utils.file_utils import get_all_files_paths_under
     from nemo_curator.filters import WordCountFilter
 
-    files = get_all_files_paths_under("books_dataset/")
+    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")
     books = DocumentDataset.read_json(files, add_filename=True)
 
     filter_step = nc.ScoreFilter(
diff --git a/docs/user-guide/sparkother.rst b/docs/user-guide/sparkother.rst
index 0da312e15..19a09c3d4 100644
--- a/docs/user-guide/sparkother.rst
+++ b/docs/user-guide/sparkother.rst
@@ -91,4 +91,4 @@ The following code snippet demonstrates how to read output from a Spark DataFram
     stories_dataset = DocumentDataset.read_parquet(processed_files, backend="pandas")
 
 It is worth noting that Spark typically tends to create checksum and other marker files which can vary by Spark distribution,
-so it is advisable to ignore them when reading data into a NeMo Curator ``DocumentDataset``.
\ No newline at end of file
+so it is advisable to ignore them when reading data into a NeMo Curator ``DocumentDataset``.
diff --git a/docs/user-guide/taskdecontamination.rst b/docs/user-guide/taskdecontamination.rst
index 46a0d9804..e62c78e06 100644
--- a/docs/user-guide/taskdecontamination.rst
+++ b/docs/user-guide/taskdecontamination.rst
@@ -28,7 +28,7 @@ Let's examine this small example:
     from nemo_curator.utils.file_utils import get_all_files_paths_under
     from nemo_curator.tasks import Winogrande, Squad, TriviaQA,
 
-    files = get_all_files_paths_under("books_dataset/")
+    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")
     books = DocumentDataset.read_json(files, add_filename=True)
 
     downstream_tasks = [
diff --git a/examples/classifier_filtering.py b/examples/classifier_filtering.py
index a6476e395..03a09b0ad 100644
--- a/examples/classifier_filtering.py
+++ b/examples/classifier_filtering.py
@@ -27,7 +27,7 @@
 
 
 def load_dataset(input_data_dir):
-    files = list(get_all_files_paths_under(input_data_dir))
+    files = list(get_all_files_paths_under(input_data_dir, keep_extensions="jsonl"))
     raw_data = read_data(files, file_type="jsonl", backend="pandas", add_filename=True)
     dataset = DocumentDataset(raw_data)
 
diff --git a/examples/exact_deduplication.py b/examples/exact_deduplication.py
index 81a2d66c1..ca2cda728 100644
--- a/examples/exact_deduplication.py
+++ b/examples/exact_deduplication.py
@@ -17,8 +17,7 @@
 
 from nemo_curator.datasets import DocumentDataset
 from nemo_curator.modules import ExactDuplicates
-from nemo_curator.utils.distributed_utils import get_client, read_data, write_to_disk
-from nemo_curator.utils.file_utils import get_all_files_paths_under
+from nemo_curator.utils.distributed_utils import get_client, write_to_disk
 from nemo_curator.utils.script_utils import ArgumentHelper
 
 
diff --git a/examples/identify_languages.py b/examples/identify_languages.py
index 2a090da0a..0704aac0b 100644
--- a/examples/identify_languages.py
+++ b/examples/identify_languages.py
@@ -26,7 +26,7 @@
 
 
 def load_dataset(input_data_dir):
-    files = list(get_all_files_paths_under(input_data_dir))
+    files = list(get_all_files_paths_under(input_data_dir, keep_extensions="jsonl"))
     raw_data = read_data(files, file_type="jsonl", backend="pandas", add_filename=True)
     dataset = DocumentDataset(raw_data)
 
diff --git a/examples/task_decontamination.py b/examples/task_decontamination.py
index daf707c31..f6162c018 100644
--- a/examples/task_decontamination.py
+++ b/examples/task_decontamination.py
@@ -44,7 +44,7 @@
 
 
 def load_dataset(input_data_dir):
-    files = list(get_all_files_paths_under(input_data_dir))
+    files = list(get_all_files_paths_under(input_data_dir, keep_extensions="jsonl"))
     raw_data = read_data(files, file_type="jsonl", backend="pandas", add_filename=True)
     dataset = DocumentDataset(raw_data)
 
diff --git a/nemo_curator/scripts/find_exact_duplicates.py b/nemo_curator/scripts/find_exact_duplicates.py
index e71ef5e11..5f6fc2435 100644
--- a/nemo_curator/scripts/find_exact_duplicates.py
+++ b/nemo_curator/scripts/find_exact_duplicates.py
@@ -55,8 +55,9 @@ def main(args):
         if num_files is not None and num_files <= 0:
             logger.info(f"Processed {num_files}... quitting")
             break
-        files = get_all_files_paths_under(root=data_path, recurse_subdirectories=False)
-        files = [f for f in files if f.endswith(".jsonl")]
+        files = get_all_files_paths_under(
+            root=data_path, recurse_subdirectories=False, keep_extensions="jsonl"
+        )
         df = read_data(
             files[:num_files] if num_files else files,
             file_type="jsonl",
diff --git a/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py b/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py
index 2a5b9d3b5..f15a70867 100644
--- a/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py
+++ b/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py
@@ -70,8 +70,9 @@ def main(args):
             print(f"Processed {args.num_files}... quitting")
             break
 
-        files = get_all_files_paths_under(root=data_path, recurse_subdirectories=False)
-        files = [f for f in files if f.endswith(".jsonl")]
+        files = get_all_files_paths_under(
+            root=data_path, recurse_subdirectories=False, keep_extensions="jsonl"
+        )
         df = read_data(
             files[:num_files] if num_files else files,
             file_type="jsonl",
diff --git a/nemo_curator/scripts/prepare_fasttext_training_data.py b/nemo_curator/scripts/prepare_fasttext_training_data.py
index c3a95683c..572cb1108 100644
--- a/nemo_curator/scripts/prepare_fasttext_training_data.py
+++ b/nemo_curator/scripts/prepare_fasttext_training_data.py
@@ -32,7 +32,9 @@ def sample_rows(df, n, seed):
 def main(args):
     client = get_client(**ArgumentHelper.parse_client_args(args))
     # Get local path
-    files = list(get_all_files_paths_under(args.input_data_dir))
+    files = list(
+        get_all_files_paths_under(args.input_data_dir, keep_extensions="jsonl")
+    )
     raw_data = read_data(files, file_type="jsonl", backend="pandas")
     dataset = DocumentDataset(raw_data)
     text_field = args.input_json_field
diff --git a/nemo_curator/utils/file_utils.py b/nemo_curator/utils/file_utils.py
index 5df61a2f6..4f8d6db09 100644
--- a/nemo_curator/utils/file_utils.py
+++ b/nemo_curator/utils/file_utils.py
@@ -452,7 +452,7 @@ def reshard_jsonl(
 
     # Output file size in bytes
     blocksize = parse_str_of_num_bytes(output_file_size)
-    input_files = list(get_all_files_paths_under(input_dir))
+    input_files = list(get_all_files_paths_under(input_dir, keep_extensions="jsonl"))
 
     # Read in the dask bag
     b = db.read_text(input_files, blocksize=blocksize)
diff --git a/tests/test_read_data.py b/tests/test_read_data.py
index 0c9a2aa55..66a09769f 100644
--- a/tests/test_read_data.py
+++ b/tests/test_read_data.py
@@ -9,7 +9,6 @@
     read_data_blocksize,
     read_data_files_per_partition,
 )
-from nemo_curator.utils.file_utils import get_all_files_paths_under
 
 NUM_FILES = 5
 NUM_RECORDS = 100
diff --git a/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb b/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb
index ae4adffd0..e14699a20 100644
--- a/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb
+++ b/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb
@@ -1459,8 +1459,9 @@
    }
   ],
   "source": [
-   "files = get_all_files_paths_under(root=input_data_dir, recurse_subdirectories=False)\n",
-   "files = [f for f in files if f.endswith(\".jsonl\")]\n",
+   "files = get_all_files_paths_under(\n",
+   "    root=input_data_dir, recurse_subdirectories=False, keep_extensions=\"jsonl\"\n",
+   ")\n",
    "df = read_data(\n",
    "    files,\n",
    "    file_type=\"jsonl\",\n",
diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb
index 2512cf736..2ab871fa5 100644
--- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb
+++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb
@@ -1140,8 +1140,9 @@
    "print(f\"Computing minhashes for {minhash_data_path}\")\n",
    "\n",
    "# Load data. Only the [minhash_id_field, text_field] columns are needed\n",
-   "files = get_all_files_paths_under(root=minhash_data_path, recurse_subdirectories=False)\n",
-   "files = [f for f in files if f.endswith(\".jsonl\")]\n",
+   "files = get_all_files_paths_under(\n",
+   "    root=minhash_data_path, recurse_subdirectories=False, keep_extensions=\"jsonl\"\n",
+   ")\n",
    "df = read_data(\n",
    "    files,\n",
    "    file_type=\"jsonl\",\n",
diff --git a/tutorials/tinystories/main.py b/tutorials/tinystories/main.py
index 84e9948ee..eb965f0b3 100644
--- a/tutorials/tinystories/main.py
+++ b/tutorials/tinystories/main.py
@@ -176,9 +176,7 @@ def run_curation_pipeline(args: Any, jsonl_dir: str) -> None:
     client = get_client(**ArgumentHelper.parse_client_args(args))
     print(f"Running curation pipeline on '{jsonl_dir}'...")
-    files = [
-        fp
-        for fp in get_all_files_paths_under(jsonl_dir, recurse_subdirectories=False)
-        if fp.endswith(".jsonl")
-    ]
+    files = get_all_files_paths_under(
+        jsonl_dir, recurse_subdirectories=False, keep_extensions="jsonl"
+    )
     print("Reading the data...")
     orig_dataset = DocumentDataset.read_json(files, add_filename=True)
diff --git a/tutorials/zyda2-tutorial/1_fuzzy_dedup/0_minhash.py b/tutorials/zyda2-tutorial/1_fuzzy_dedup/0_minhash.py
index fcbbf9dde..ce86c9213 100644
--- a/tutorials/zyda2-tutorial/1_fuzzy_dedup/0_minhash.py
+++ b/tutorials/zyda2-tutorial/1_fuzzy_dedup/0_minhash.py
@@ -13,8 +13,7 @@
 
 
 def read_folder(input_folder, columns=["nemo_id", "text"]):
-    data_paths = get_all_files_paths_under(input_folder)
-    data_paths = [f for f in data_paths if f.endswith(".parquet")]
+    data_paths = get_all_files_paths_under(input_folder, keep_extensions="parquet")
    data_paths.sort()
     logging.info(f"Number of files being read: {len(data_paths)}")
     text_ddf = dask_cudf.read_parquet(
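
For context, a minimal sketch of the before/after pattern this patch applies at every call site (the "books_dataset/" path is illustrative, taken from the docs examples above; both variants assume `get_all_files_paths_under` from `nemo_curator.utils.file_utils` returns a flat list of paths and that `keep_extensions` filters them by suffix, as the updated docs describe):

    from nemo_curator.utils.file_utils import get_all_files_paths_under

    # Old pattern: list every file, then filter by extension manually.
    files = get_all_files_paths_under("books_dataset/")
    files = [f for f in files if f.endswith(".jsonl")]

    # New pattern: let the helper keep only the .jsonl files.
    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")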