diff --git a/.github/workflows/build-test-publish-wheel.yml b/.github/workflows/build-test-publish-wheel.yml
index 4e3d0a897..58dad0431 100644
--- a/.github/workflows/build-test-publish-wheel.yml
+++ b/.github/workflows/build-test-publish-wheel.yml
@@ -27,20 +27,12 @@ defaults:
jobs:
build-test-publish-wheel:
- uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_build_test_publish_wheel.yml@v0.7.0
+ uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_build_test_publish_wheel.yml@v0.20.0
with:
- image-name: nemo_curator_container
- dockerfile: Dockerfile
- image-label: nemo-curator
- build-args: |
- IMAGE_LABEL=nemo-curator
- REPO_URL=https://github.com/${{ github.repository }}.git
- CURATOR_COMMIT=${{ github.sha }}
- prune-filter-timerange: 24h
dry-run: true
python-package: nemo_curator
- container-workdir: /opt/NeMo-Curator/
environment: public
+ python-version: '3.10'
secrets:
TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 34a087ea0..7f1a04064 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -31,19 +31,11 @@ on:
description: Branch to target for version bump
jobs:
release:
- uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.18.4
+ uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.20.1
with:
release-ref: ${{ inputs.release-ref }}
- image-name: nemo_curator_container
- dockerfile: Dockerfile
- image-label: nemo-curator
- build-args: |
- IMAGE_LABEL=nemo-curator
- REPO_URL=https://github.com/${{ github.repository }}.git
- CURATOR_COMMIT=${{ inputs.release-ref }}
- prune-filter-timerange: 24h
python-package: nemo_curator
- container-workdir: /opt/NeMo-Curator
+ python-version: '3.10'
library-name: NeMo Curator
dry-run: ${{ inputs.dry-run }}
version-bump-branch: ${{ inputs.version-bump-branch }}
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 1d8cc9258..ac5f822ff 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -19,7 +19,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest]
- python-version: ["3.10"]
+ python-version: ["3.10", "3.12"]
steps:
- uses: actions/checkout@v4
- name: Optionally free up space on Ubuntu
diff --git a/Dockerfile b/Dockerfile
index b0d1bedc7..1bcfc7d38 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,8 @@
# See https://github.com/rapidsai/ci-imgs for ARG options
-# NeMo Curator requires Python 3.10, Ubuntu 22.04/20.04, and CUDA 12 (or above)
+# NeMo Curator requires Python 3.12, Ubuntu 22.04/20.04, and CUDA 12 (or above)
ARG CUDA_VER=12.5.1
ARG LINUX_VER=ubuntu22.04
-ARG PYTHON_VER=3.10
+ARG PYTHON_VER=3.12
ARG IMAGE_LABEL
ARG REPO_URL
ARG CURATOR_COMMIT
@@ -33,7 +33,7 @@ ARG CUDA_VER
# Install the minimal libcu* libraries needed by NeMo Curator
RUN conda create -y --name curator -c nvidia/label/cuda-${CUDA_VER} -c conda-forge \
- python=3.10 \
+ python=3.12 \
cuda-cudart \
libcufft \
libcublas \
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 000000000..1aba38f67
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1 @@
+include LICENSE
diff --git a/README.md b/README.md
index f4d18cbc4..d52129f46 100644
--- a/README.md
+++ b/README.md
@@ -69,7 +69,7 @@ This section explains how to install NeMo Curator and use the Python library, Py
Before installing NeMo Curator, ensure that the following requirements are met:
-- Python 3.10
+- Python 3.10 or higher
- Ubuntu 22.04/20.04
- NVIDIA GPU (optional)
- Volta™ or higher ([compute capability 7.0+](https://developer.nvidia.com/cuda-gpus))
@@ -158,7 +158,7 @@ To get started with NeMo Curator, you can follow the tutorials [available here](
- [`tinystories`](https://github.com/NVIDIA/NeMo-Curator/tree/main/tutorials/tinystories) which focuses on data curation for training LLMs from scratch.
- [`peft-curation`](https://github.com/NVIDIA/NeMo-Curator/tree/main/tutorials/peft-curation) which focuses on data curation for LLM parameter-efficient fine-tuning (PEFT) use-cases.
-- [`distributed_data_classification`](https://github.com/NVIDIA/NeMo-Curator/tree/main/tutorials/distributed_data_classification) which focuses on using the domain and quality classifiers to help with data annotation.
+- [`distributed_data_classification`](https://github.com/NVIDIA/NeMo-Curator/tree/main/tutorials/distributed_data_classification) which demonstrates how to use NVIDIA's Hugging Face classifiers to help with data annotation.
- [`single_node_tutorial`](https://github.com/NVIDIA/NeMo-Curator/tree/main/tutorials/single_node_tutorial) which demonstrates an end-to-end data curation pipeline for curating Wikipedia data in Thai.
- [`image-curation`](https://github.com/NVIDIA/NeMo-Curator/blob/main/tutorials/image-curation/image-curation.ipynb) which explores the scalable image curation modules.
diff --git a/config/sem_dedup_config.yaml b/config/sem_dedup_config.yaml
index 39787d2fb..08366d43a 100644
--- a/config/sem_dedup_config.yaml
+++ b/config/sem_dedup_config.yaml
@@ -6,6 +6,7 @@ num_files: 16
embeddings_save_loc: "embeddings"
embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
embedding_batch_size: 128
+write_embeddings_to_disk: true
# Clustering configuration
clustering_save_loc: "clustering_results"
diff --git a/docs/user-guide/image/gettingstarted.rst b/docs/user-guide/image/gettingstarted.rst
index 2ccacb25e..49248bc70 100644
--- a/docs/user-guide/image/gettingstarted.rst
+++ b/docs/user-guide/image/gettingstarted.rst
@@ -12,7 +12,7 @@ Install NeMo Curator
---------------------
To install the image curation modules of NeMo Curator, ensure you meet the following requirements:
-* Python 3.10
+* Python 3.10 or higher
* Ubuntu 22.04/20.04
* NVIDIA GPU
* Volta™ or higher (compute capability 7.0+)
diff --git a/docs/user-guide/semdedup.rst b/docs/user-guide/semdedup.rst
index 31ba6efd5..172b79d03 100644
--- a/docs/user-guide/semdedup.rst
+++ b/docs/user-guide/semdedup.rst
@@ -45,6 +45,7 @@ Semantic deduplication in NeMo Curator can be configured using a YAML file. Here
embeddings_save_loc: "embeddings"
embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
embedding_batch_size: 128
+ write_embeddings_to_disk: true
# Clustering configuration
clustering_save_loc: "clustering_results"
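For reference, a minimal Python sketch (not part of this patch) of constructing the updated semantic deduplication config with the new `write_embeddings_to_disk` option; the cache path below is a placeholder and other fields keep their defaults:

```python
from nemo_curator.modules.config import SemDedupConfig

# Sketch only: the cache path is a placeholder.
config = SemDedupConfig(
    cache_dir="/path/to/semdedup_cache",
    embedding_model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
    embedding_batch_size=128,
    # New option: keep embeddings in memory instead of writing them to disk.
    # Useful for a delayed pipeline, but it can increase memory overhead.
    write_embeddings_to_disk=False,
)
```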
diff --git a/examples/fuzzy_deduplication.py b/examples/fuzzy_deduplication.py
index b7da2470c..51344ccb9 100644
--- a/examples/fuzzy_deduplication.py
+++ b/examples/fuzzy_deduplication.py
@@ -69,14 +69,12 @@ def main(args):
id_field=dataset_id_field,
text_field=dataset_text_field,
seed=42,
- char_ngrams=5,
+ char_ngrams=24,
num_buckets=20,
hashes_per_bucket=13,
use_64_bit_hash=False,
- buckets_per_shuffle=5,
- false_positive_check=True,
- num_anchors=2,
- jaccard_threshold=0.8,
+ buckets_per_shuffle=5, # set to a smaller value if encountering OOMs during LSH
+ false_positive_check=False,
)
fuzzy_dup = FuzzyDuplicates(logger=log_dir, config=fuzzy_dedup_config)
duplicates = fuzzy_dup(dataset=input_dataset)
diff --git a/nemo_curator/modules/config.py b/nemo_curator/modules/config.py
index 6b0dc4a61..77eee06ef 100644
--- a/nemo_curator/modules/config.py
+++ b/nemo_curator/modules/config.py
@@ -67,47 +67,69 @@ class FuzzyDuplicatesConfig(BaseConfig):
# Minhash + LSH Config
seed: int = 42
- char_ngrams: int = 5
+ char_ngrams: int = 24
num_buckets: int = 20
hashes_per_bucket: int = 13
use_64_bit_hash: bool = False
buckets_per_shuffle: int = 1
- false_positive_check: bool = True
- # Only required for fp check
- num_anchors: int = 2
- jaccard_threshold: float = 0.8
- bucket_mapping_blocksize: int = 256
- parts_per_worker: int = 1
- bucket_parts_per_worker: int = 8
+ false_positive_check: bool = False
+ # Only required for false positive check
+ num_anchors: Optional[int] = None
+ jaccard_threshold: Optional[float] = None
+ bucket_mapping_blocksize: Optional[int] = None
+ parts_per_worker: Optional[int] = None
+ bucket_parts_per_worker: Optional[int] = None
def __post_init__(self):
self.num_hashes = self.num_buckets * self.hashes_per_bucket
- if self.cache_dir is None:
- raise ValueError(
- "Finding fuzzy duplicates requires a cache directory accessible via all workers to store intermediates"
- )
+ false_positive_defaults = {
+ "num_anchors": 2,
+ "jaccard_threshold": 0.8,
+ "bucket_mapping_blocksize": 256,
+ "parts_per_worker": 1,
+ "bucket_parts_per_worker": 8,
+ }
if self.false_positive_check:
warnings.warn(
"Identifying false positives during the Minhash deduplication is computationally expensive."
" For improved performance consider setting this to False"
)
- if not self.false_positive_check and self.char_ngrams < 20:
- warnings.warn(
- "Using a small char_ngrams value might lead to a large number (~5%) of false positives during deduplication."
- " Using a value of at least 20 for char_ngrams is recommended."
- )
- if self.num_anchors <= 0:
- raise ValueError("Number of anchors must be greater than 0")
- if self.num_anchors > 2:
- warnings.warn(
- "Using a higher number of anchor docs might lead to higher memory footprint and might impact performance",
- category=UserWarning,
+ for arg, default in false_positive_defaults.items():
+ if getattr(self, arg) is None:
+ setattr(self, arg, default)
+ if self.num_anchors <= 0:
+ raise ValueError("Number of anchors must be greater than 0")
+ if self.num_anchors > 2:
+ warnings.warn(
+ "Using a higher number of anchor docs might lead to higher memory footprint and might impact performance",
+ category=UserWarning,
+ )
+ if not 0 <= self.jaccard_threshold <= 1:
+ raise ValueError("Jaccard Threshold must be between [0,1]")
+ else:
+ if self.char_ngrams < 20:
+ warnings.warn(
+ "Using a small char_ngrams value might lead to a large number (~5%) of false positives during deduplication."
+ " Using a value of at least 20 for char_ngrams is recommended."
+ )
+ unused_false_positive_args = [
+ arg
+ for arg in false_positive_defaults.keys()
+ if getattr(self, arg) is not None
+ ]
+ if unused_false_positive_args:
+ warnings.warn(
+ f"False positive check is disabled. Unused arguments {unused_false_positive_args} will be ignored",
+ category=UserWarning,
+ )
+
+ if self.cache_dir is None:
+ raise ValueError(
+ "Finding fuzzy duplicates requires a cache directory accessible via all workers to store intermediates"
)
- if not 0 <= self.jaccard_threshold <= 1:
- raise ValueError("Jaccard Threshold must be between [0,1]")
- if self.buckets_per_shuffle <= 0:
- raise ValueError("Buckets per shuffle must be greater than 0")
+ if not 1 <= self.buckets_per_shuffle <= self.num_buckets:
+ raise ValueError("Buckets per shuffle must be between [1, num_buckets]")
@dataclass
@@ -123,7 +145,10 @@ class SemDedupConfig(BaseConfig):
embeddings_save_loc (str): Location to save embeddings.
embedding_model_name_or_path (str): Model name or path for embeddings.
embedding_batch_size (int): Inital Batch size for processing embeddings.
- embedding_pooling_strategy (str): Strategy for pooling embeddings, either "mean" or "last_token". Defaults to "last_token".
+        embedding_pooling_strategy (str): Strategy for pooling embeddings, either "mean_pooling" or "last_token". Defaults to "mean_pooling".
+        write_embeddings_to_disk (bool): If True, saves the embeddings to disk. Defaults to True.
+            We recommend setting this to False when you have a delayed pipeline;
+            note that setting it to False can lead to more memory overhead.
clustering_save_loc (str): Location to save clustering results.
n_clusters (int): Number of clusters.
seed (int): Seed for clustering.
@@ -146,6 +171,7 @@ class SemDedupConfig(BaseConfig):
embedding_batch_size: int = 128
# Options: "mean_pooling", "last_token"
embedding_pooling_strategy: str = "mean_pooling"
+ write_embeddings_to_disk: bool = True
# Clustering config
clustering_save_loc: str = "clustering_results"
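As a hedged illustration (not part of this patch) of the new `FuzzyDuplicatesConfig` defaults, mirroring the updated `examples/fuzzy_deduplication.py`; the cache path and column names are placeholders, and the top-level imports are assumed to match the rest of the repo:

```python
from nemo_curator import FuzzyDuplicates, FuzzyDuplicatesConfig

config = FuzzyDuplicatesConfig(
    cache_dir="/path/to/dedup_cache",  # required; must be reachable by all workers
    id_field="id",
    text_field="text",
    # New defaults: char_ngrams=24 and false_positive_check=False, so the
    # Jaccard-specific arguments (num_anchors, jaccard_threshold, ...) can stay unset.
    buckets_per_shuffle=5,  # lower this if LSH runs out of memory
)
fuzzy_dup = FuzzyDuplicates(config=config)
# duplicates = fuzzy_dup(dataset=input_dataset)
```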
diff --git a/nemo_curator/modules/fuzzy_dedup/minhash.py b/nemo_curator/modules/fuzzy_dedup/minhash.py
index b38b22685..28fa9aca5 100644
--- a/nemo_curator/modules/fuzzy_dedup/minhash.py
+++ b/nemo_curator/modules/fuzzy_dedup/minhash.py
@@ -39,7 +39,7 @@ def __init__(
self,
seed: int = 42,
num_hashes: int = 260,
- char_ngrams: int = 5,
+ char_ngrams: int = 24,
use_64bit_hash: bool = False,
logger: Union[logging.LoggerAdapter, str] = "./",
id_field: str = "id",
diff --git a/nemo_curator/modules/semantic_dedup/embeddings.py b/nemo_curator/modules/semantic_dedup/embeddings.py
index fad0a53ce..03aec45ec 100644
--- a/nemo_curator/modules/semantic_dedup/embeddings.py
+++ b/nemo_curator/modules/semantic_dedup/embeddings.py
@@ -241,6 +241,7 @@ def __call__(self, dataset: DocumentDataset) -> DocumentDataset:
)
)
else:
+ embedding_ddf = self.create_embeddings(dataset.df, self.input_column)
ddf = DocumentDataset(embedding_ddf)
self.logger.info(
diff --git a/nemo_curator/modules/semantic_dedup/semdedup.py b/nemo_curator/modules/semantic_dedup/semdedup.py
index 3c389f415..d145a4cdd 100644
--- a/nemo_curator/modules/semantic_dedup/semdedup.py
+++ b/nemo_curator/modules/semantic_dedup/semdedup.py
@@ -51,6 +51,7 @@ def __init__(
embedding_pooling_strategy=config.embedding_pooling_strategy,
input_column=input_column,
embedding_output_dir=os.path.join(cache_dir, config.embeddings_save_loc),
+ write_embeddings_to_disk=config.write_embeddings_to_disk,
logger=logger,
profile_dir=self.config.profile_dir,
)
diff --git a/nemo_curator/package_info.py b/nemo_curator/package_info.py
index 65c7ae069..26f99e5f3 100644
--- a/nemo_curator/package_info.py
+++ b/nemo_curator/package_info.py
@@ -14,10 +14,10 @@
MAJOR = 0
-MINOR = 6
+MINOR = 7
PATCH = 0
-PRE_RELEASE = "rc0"
-DEV = "dev1"
+PRE_RELEASE = "rc1"
+DEV = "dev0"
# Use the following formatting: (major, minor, patch, pre-release)
VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE, DEV)
diff --git a/nemo_curator/sample_dataframe.py b/nemo_curator/sample_dataframe.py
deleted file mode 100644
index 59ff33545..000000000
--- a/nemo_curator/sample_dataframe.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import time
-
-from nemo_curator.distributed_data_classification.arg_utils import (
- add_cluster_args,
- add_input_output_args,
-)
-from nemo_curator.utils.distributed_utils import get_client, read_data, write_to_disk
-from nemo_curator.utils.file_utils import get_all_files_paths_under
-from nemo_curator.utils.script_utils import ArgumentHelper
-
-
-def sample_dataframe(df, num_samples):
- """
- This function samples a specified number of rows from a DataFrame.
-
- Args:
- df: A DataFrame.
- num_samples: The number of rows to randomly sample from the DataFrame.
- Returns:
- The sampled DataFrame.
-
- """
- len_df = len(df)
- print(f"Total length = {len_df}", flush=True)
- sampled_df = df.sample(frac=(num_samples / len_df), random_state=42)
- return sampled_df
-
-
-if __name__ == "__main__":
- """
- This script is useful if a user wishes to sample a very large dataset for smaller scale testing,
- for example, a dataset written as a directory containing thousands of jsonl files.
- """
- parser = argparse.ArgumentParser(description="Sample rows and write them to disk")
-
- parser = add_cluster_args(parser)
- parser = add_input_output_args(parser)
- parser.add_argument(
- "--num_samples",
- type=int,
- help="The number of rows to sample",
- required=True,
- )
-
- args = parser.parse_args()
- print(f"Arguments parsed = {args}", flush=True)
- client = get_client(**ArgumentHelper.parse_client_args(args), cluster_type="gpu")
-
- print("Starting sampling workflow", flush=True)
- st = time.time()
- df = read_data(
- input_files=get_all_files_paths_under(
- args.input_file_path, recurse_subdirecties=False
- ),
- file_type=args.input_file_type,
- add_filename=True,
- )
- input_files = get_all_files_paths_under(
- args.input_file_path, recurse_subdirecties=False
- )
- sampled_df = sample_dataframe(df, num_samples=args.num_samples)
- write_to_disk(
- df=sampled_df,
- output_path=args.output_file_path,
- write_to_filename=True,
- )
- et = time.time()
- print(f"Sampling workflow completed in {et-st}", flush=True)
- client.close()
diff --git a/nemo_curator/scripts/fuzzy_deduplication/README.md b/nemo_curator/scripts/fuzzy_deduplication/README.md
index f5a43f405..63dcdb5c8 100644
--- a/nemo_curator/scripts/fuzzy_deduplication/README.md
+++ b/nemo_curator/scripts/fuzzy_deduplication/README.md
@@ -2,98 +2,4 @@
This directory consists of scripts that can be invoked directly via the command line for finding fuzzy duplicates from a group of Jsonl files consisting of text & unique ID's that are specifically formatted using the `add_id` script included as a part of NeMo-Curator.
> [!IMPORTANT]
-> The scripts are helper utilities that wrap the fuzzy_dedup API for handling multiple jsonl directories and the id format generated by [add_id](../add_id.py). For most cases we recommend working with the fuzzy_deduplication API directly.
-
-### Usage
-1. Compute Minhashes
- - Input: Data Directories
- - Output: minhashes.parquet for each data dir.
- - Example call:
- ```bash
- # same as `python compute_minhashes.py`
- gpu_compute_minhashes \
- --input-data-dirs /path/to/jsonl/dir1 /path/to/jsonl/dir2 \
- --output-minhash-dir /path/to/output_minhashes \
- --input-json-text-field text_column_name \
- --input-json-id-field id_column_name \
- --minhash-length number_of_hashes \
- --char-ngram char_ngram_size \
- --hash-bytes 4(or 8 byte hashes) \
- --seed 42 \
- --log-dir ./
- # --scheduler-file /path/to/file.json
- ```
-2. Buckets (Minhash Buckets)
- - Input: Minhash directories
- - Output: Buckets.parquet
- - Example call:
- ```bash
- # same as `python minhash_lsh.py`
- minhash_buckets \
- --input-data-dirs /path/to/output_minhashes/dir1 /path/to/output_minhashes/dir2 \
- --output-bucket-dir /path/to/dedup_output \
- --input-minhash-field _minhash_signature \
- --input-json-id-field id_column_name \
- --minhash-length number_of_hashes \
- --num-bands num_bands \
- --buckets-per-shuffle 1 `#Value b/w [1-num_bands]. Higher is better but might lead to oom` \
- --log-dir ./
- # --scheduler-file /path/to/file.json
- ```
-3. Jaccard Map Buckets
- - Input: Buckets.parquet + Data Dir
- - Output: anchor_docs_with_bk.parquet
- - Example call:
- ```bash
- # same as `python map_buckets.py`
- jaccard_map_buckets \
- --input-data-dirs /path/to/jsonl/dir1 /path/to/jsonl/dir2 \
- --input-bucket-dir /path/to/dedup_output/_buckets.parquet \
- --output-dir /path/to/dedup_output \
- --input-json-text-field text_column_name \
- --input-json-id-field id_column_name \
- # --scheduler-file /path/to/file.json
- ```
-4. Jaccard Shuffle
- - Input: anchor_docs_with_bk.parquet + Data Dir
- - Output: shuffled_docs.parquet
- - Example call:
- ```bash
- # same as `python jaccard_shuffle.py`
- jaccard_shuffle \
- --input-data-dirs /path/to/jsonl/dir1 /path/to/jsonl/dir2 \
- --input-bucket-mapping-dir /path/to/dedup_output/anchor_docs_with_bk.parquet \
- --output-dir /path/to/dedup_output \
- --input-json-text-field text_column_name \
- --input-json-id-field id_column_name \
- # --scheduler-file /path/to/file.json
- ```
-5. Jaccard compute
- - Input: Shuffled docs.parquet
- - Output: jaccard_similarity_results.parquet
- - Example call:
- ```bash
- # same as `python jaccard_compute.py`
- jaccard_compute \
- --shuffled-docs-path /path/to/dedup_output/shuffled_docs.parquet \
- --output-dir /path/to/dedup_output \
- --ngram-size char_ngram_size_for_similarity \
- # --scheduler-file /path/to/file.json
- ```
-6. Connected Components
- - Input: jaccard_similarity_results.parquet
- - Output: connected_components.parquet
- - Example call:
- ```bash
- # same as `python connected_components.py`
- gpu_connected_component \
- --jaccard-pairs_path /path/to/dedup_output/jaccard_similarity_results.parquet \
- --output-dir /path/to/dedup_output \
- --cache-dir /path/to/cc_cache \
- --jaccard-threshold 0.8
- # --scheduler-file /path/to/file.json
- ```
-
-> [!TIP]
-> When using these scripts in a multi-node environment (like Slurm, K8's etc.) it is recommended to start up a Dask cluster prior to execution and connect to the existing cluster via the `--scheduler-address` or `--scheduler-file` flag.
-> Use the `--help` flag to view all possible CLI options for the scripts and details on what they do.
+> Up-to-date documentation on running the fuzzy deduplication scripts can be found in the [NeMo Curator User Guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/gpudeduplication.html#id4). For most cases, we recommend using the Python API directly rather than the CLI scripts.
diff --git a/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py b/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py
index aa4e1f63f..2a5b9d3b5 100644
--- a/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py
+++ b/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py
@@ -129,7 +129,7 @@ def attach_args():
parser.add_argument(
"--char-ngram",
type=int,
- default=5,
+ default=24,
help="The number of consecutive characters to include in a sliding "
"window when creating the document shingles for computing "
"minhash signatures.",
diff --git a/nemo_curator/scripts/semdedup/compute_embeddings.py b/nemo_curator/scripts/semdedup/compute_embeddings.py
index e46c9d010..63fe37313 100644
--- a/nemo_curator/scripts/semdedup/compute_embeddings.py
+++ b/nemo_curator/scripts/semdedup/compute_embeddings.py
@@ -80,6 +80,7 @@ def main(args):
semdedup_config.cache_dir, semdedup_config.embeddings_save_loc
),
input_column=args.input_text_field,
+ write_embeddings_to_disk=semdedup_config.write_embeddings_to_disk,
logger=logger,
write_to_filename=True,
)
diff --git a/nemo_curator/utils/distributed_utils.py b/nemo_curator/utils/distributed_utils.py
index 8adc982b0..addabfd9c 100644
--- a/nemo_curator/utils/distributed_utils.py
+++ b/nemo_curator/utils/distributed_utils.py
@@ -16,6 +16,7 @@
import ast
import os
import shutil
+import subprocess
import dask
@@ -565,6 +566,9 @@ def read_data(
"""
if isinstance(input_files, str):
input_files = [input_files]
+
+ check_dask_cwd(input_files)
+
if file_type == "pickle":
df = read_pandas_pickle(
input_files[0], add_filename=add_filename, columns=columns, **kwargs
@@ -1013,6 +1017,24 @@ def get_current_client():
return None
+def check_dask_cwd(file_list: List[str]):
+ if any(not os.path.isabs(file_path) for file_path in file_list):
+ dask_cwd_list = list(get_current_client().run(os.getcwd).values())
+ if len(set(dask_cwd_list)) <= 1:
+ dask_cwd = dask_cwd_list[0]
+ os_pwd = subprocess.check_output("pwd", shell=True, text=True).strip()
+ if dask_cwd != os_pwd:
+ raise RuntimeError(
+ "Mismatch between Dask client and worker working directories. "
+ "Use absolute file paths to ensure the correct files are read as intended."
+ )
+ else:
+ raise RuntimeError(
+ "Mismatch between at least 2 Dask workers' working directories. "
+ "Use absolute file paths to ensure the correct files are read as intended."
+ )
+
+
def performance_report_if(
path: Optional[str] = None, report_name: str = "dask-profile.html"
):
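A brief, hedged sketch (not part of this patch) of what the new `check_dask_cwd` guard means for callers: `read_data` now verifies that relative paths resolve identically on the Dask client and every worker, so absolute paths are the safe pattern. The cluster type and file path below are assumptions:

```python
import os

from nemo_curator.utils.distributed_utils import get_client, read_data

client = get_client(cluster_type="cpu")  # assumption: a local CPU cluster

# Relative paths now trigger a working-directory consistency check across the
# client and workers; absolute paths sidestep the RuntimeError entirely.
files = [os.path.abspath("data/shard_00.jsonl")]  # placeholder path
df = read_data(input_files=files, file_type="jsonl", add_filename=True)
```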
diff --git a/pyproject.toml b/pyproject.toml
index 329c2eec4..d298ae67a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ name = "nemo_curator"
description = "Scalable Data Preprocessing Tool for Training Large Language Models"
readme = { file = "README.md", content-type = "text/markdown" }
authors = [
+ { name = "Ryan Wolf", email = "rywolf@nvidia.com" },
{ name = "Joseph Jennings", email = "jjennings@nvidia.com" },
{ name = "Mostofa Patwary", email = "mpatwary@nvidia.com" },
{ name = "Sandeep Subramanian", email = "sasubramania@nvidia.com" },
@@ -28,15 +29,15 @@ authors = [
{ name = "Ayush Dattagupta", email = "adattagupta@nvidia.com" },
{ name = "Vibhu Jawa", email = "vjawa@nvidia.com" },
{ name = "Jiwei Liu", email = "jiweil@nvidia.com" },
- { name = "Ryan Wolf", email = "rywolf@nvidia.com" },
{ name = "Sarah Yurick", email = "syurick@nvidia.com" },
]
classifiers = [
"Development Status :: 3 - Alpha",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.12",
]
-requires-python = ">=3.10, <3.11"
+requires-python = ">=3.10"
dependencies = [
"awscli>=1.22.55",
"beautifulsoup4",
diff --git a/tests/test_fuzzy_dedup.py b/tests/test_fuzzy_dedup.py
index 8b8453948..e62ab91b2 100644
--- a/tests/test_fuzzy_dedup.py
+++ b/tests/test_fuzzy_dedup.py
@@ -341,7 +341,7 @@ def test_fuzzy_dedup(
num_buckets=num_buckets,
hashes_per_bucket=1,
use_64_bit_hash=use_64_bit_hash,
- buckets_per_shuffle=5,
+ buckets_per_shuffle=3,
false_positive_check=True,
num_anchors=2,
jaccard_threshold=jaccard_threshold,
@@ -375,6 +375,7 @@ def test_different_fields(self, fuzzy_dedup_data, tmpdir):
false_positive_check=True,
num_anchors=2,
jaccard_threshold=0.39,
+ char_ngrams=5,
)
fuzzy_duplicates = FuzzyDuplicates(config=config)
result = fuzzy_duplicates(fuzzy_dedup_data)
@@ -487,7 +488,7 @@ def test_no_fp_check(
num_buckets=num_buckets,
hashes_per_bucket=1,
use_64_bit_hash=use_64_bit_hash,
- buckets_per_shuffle=5,
+ buckets_per_shuffle=3,
false_positive_check=False,
num_anchors=2,
jaccard_threshold=0.39,
@@ -575,11 +576,25 @@ def test_fuzzy_dedup_no_duplicates(
class TestFuzzyDuplicatesConfig:
def test_bad_inputs(self, tmpdir):
with pytest.raises(ValueError):
- FuzzyDuplicatesConfig(cache_dir=tmpdir, num_anchors=0)
+ FuzzyDuplicatesConfig(
+ cache_dir=tmpdir, num_anchors=0, false_positive_check=True
+ )
+ FuzzyDuplicatesConfig(
+ cache_dir=tmpdir, jaccard_threshold=1.2, false_positive_check=True
+ )
+ FuzzyDuplicatesConfig(cache_dir=tmpdir, buckets_per_shuffle=0)
+ FuzzyDuplicatesConfig(
+ cache_dir=tmpdir, buckets_per_shuffle=2, num_buckets=1
+ )
+ FuzzyDuplicatesConfig(
+ cache_dir=None, num_anchors=0, false_positive_check=True
+ )
with pytest.warns(
UserWarning, match="Using a higher number of anchor docs might"
):
- FuzzyDuplicatesConfig(cache_dir=tmpdir, num_anchors=3)
+ FuzzyDuplicatesConfig(
+ cache_dir=tmpdir, num_anchors=3, false_positive_check=True
+ )
with pytest.warns(
UserWarning, match="Using a small char_ngrams value might lead"
):
@@ -591,10 +606,16 @@ def test_bad_inputs(self, tmpdir):
match="Identifying false positives during the Minhash deduplication is computationally expensive",
):
FuzzyDuplicatesConfig(cache_dir=tmpdir, false_positive_check=True)
- with pytest.raises(ValueError):
- FuzzyDuplicatesConfig(cache_dir=tmpdir, jaccard_threshold=1.2)
- with pytest.raises(ValueError):
- FuzzyDuplicatesConfig(cache_dir=tmpdir, buckets_per_shuffle=0)
+ with pytest.warns(
+ UserWarning,
+ match="False positive check is disabled. Unused arguments",
+ ):
+ FuzzyDuplicatesConfig(
+ cache_dir=tmpdir,
+ false_positive_check=False,
+ num_anchors=2,
+ jaccard_threshold=0.8,
+ )
def test_from_yaml(self, tmpdir):
yaml_params = {
diff --git a/tutorials/README.md b/tutorials/README.md
index 5c619c89e..0fb8a46e0 100644
--- a/tutorials/README.md
+++ b/tutorials/README.md
@@ -19,7 +19,7 @@ To get started, we recommend starting with the following tutorials to become fam
| [pretraining-data-curation](./pretraining-data-curation/) | Demonstrates accelerated pipeline for curating large-scale data for LLM pretraining in a distributed environment | |
| [pretraining-vietnamese-data-curation](./pretraining-vietnamese-data-curation/) | Demonstrates how to use NeMo Curator to process large-scale and high-quality Vietnamese data in a distributed environment | |
| [dapt-curation](./dapt-curation) | Data curation sample for domain-adaptive pre-training (DAPT), focusing on [ChipNeMo](https://blogs.nvidia.com/blog/llm-semiconductors-chip-nemo/) data curation as an example | [Blog post](https://developer.nvidia.com/blog/streamlining-data-processing-for-domain-adaptive-pretraining-with-nvidia-nemo-curator/) |
-| [distributed_data_classification](./distributed_data_classification) | Demonstrates data domain and data quality classification at scale in a distributed environment | |
+| [distributed_data_classification](./distributed_data_classification) | Demonstrates machine learning classification with NVIDIA's Hugging Face models at scale in a distributed environment | |
| [nemotron_340B_synthetic_datagen](./nemotron_340B_synthetic_datagen) | Demonstrates the use of NeMo Curator synthetic data generation modules to leverage [Nemotron-4 340B Instruct](https://build.nvidia.com/nvidia/nemotron-4-340b-instruct) for generating synthetic preference data | |
| [nemo-retriever-synthetic-data-generation](./nemo_retriever_synthetic_data_generation) | Demonstrates the use of NeMo Curator synthetic data generation modules to leverage [NIM models](https://ai.nvidia.com) for generating synthetic data and perform data quality assesement on generated data using LLM-as-judge and embedding-model-as-judge. The generated data would be used to evaluate retrieval/RAG pipelines |
| [peft-curation](./peft-curation/) | Data curation sample for parameter efficient fine-tuning (PEFT) use-cases | [Blog post](https://developer.nvidia.com/blog/curating-custom-datasets-for-llm-parameter-efficient-fine-tuning-with-nvidia-nemo-curator/) |
diff --git a/tutorials/dapt-curation/README.md b/tutorials/dapt-curation/README.md
index 4f67c6164..55ddb2b5c 100755
--- a/tutorials/dapt-curation/README.md
+++ b/tutorials/dapt-curation/README.md
@@ -47,9 +47,9 @@ The tutorial follows the steps below:
After installing the NeMo Curator package, install the dependencies and run:
```bash
-pip install -r code/requirements.txt
cd code
-python main.py
+pip install -r requirements.txt
+python main.py --device "gpu"
```
-This will download chip-design related datasets and begin the data curation pipeline.
+This will download chip-design-related datasets and begin the data curation pipeline. Please use `--device "gpu"` to enable semantic and fuzzy deduplication, which require a GPU.
diff --git a/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml b/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml
index 17e4c17ca..5b8e63b78 100644
--- a/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml
+++ b/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml
@@ -6,6 +6,7 @@ num_files: 16
embeddings_save_loc: "embeddings"
embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
embedding_batch_size: 128
+write_embeddings_to_disk: false
# Clustering configuration
clustering_save_loc: "clustering_results"
diff --git a/tutorials/dapt-curation/code/main.py b/tutorials/dapt-curation/code/main.py
index 3ae5fe178..5f51ead85 100755
--- a/tutorials/dapt-curation/code/main.py
+++ b/tutorials/dapt-curation/code/main.py
@@ -37,11 +37,8 @@
)
import nemo_curator as nc
-from nemo_curator import ExactDuplicates, Modify, ScoreFilter, Sequential
+from nemo_curator import ScoreFilter, Sequential
from nemo_curator.datasets import DocumentDataset
-from nemo_curator.filters import RepeatingTopNGramsFilter, WordCountFilter
-from nemo_curator.modifiers.pii_modifier import PiiModifier
-from nemo_curator.modifiers.unicode_reformatter import UnicodeReformatter
from nemo_curator.utils.distributed_utils import get_client
from nemo_curator.utils.file_utils import (
get_all_files_paths_under,
@@ -191,7 +188,7 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None:
duplicates = semantic_dedupe(
dataset=gpu_dataset_text,
sem_dedupe_config_yaml_path=sem_dedupe_config_yaml_path,
- cache=CACHE_DIR,
+ cache_dir=CACHE_DIR,
)
unique_ids = duplicates.df.to_backend("pandas").compute()["id"]
semantic_dataset_text = DocumentDataset(
diff --git a/tutorials/dapt-curation/code/utils.py b/tutorials/dapt-curation/code/utils.py
index dc91b2258..2d601688e 100755
--- a/tutorials/dapt-curation/code/utils.py
+++ b/tutorials/dapt-curation/code/utils.py
@@ -12,13 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import json
import os
-import re
-
-import dask.dataframe as dd
-import pandas as pd
-import yaml
from nemo_curator import (
ExactDuplicates,
@@ -33,7 +27,6 @@
from nemo_curator.datasets import DocumentDataset
from nemo_curator.filters import (
DocumentFilter,
- RepeatedLinesFilter,
RepeatedParagraphsFilter,
RepeatingTopNGramsFilter,
UrlsFilter,
@@ -46,12 +39,7 @@
from nemo_curator.modifiers import DocumentModifier
from nemo_curator.modifiers.pii_modifier import PiiModifier
from nemo_curator.modifiers.unicode_reformatter import UnicodeReformatter
-from nemo_curator.pii.constants import DEFAULT_LANGUAGE, DEFAULT_MAX_DOC_SIZE
-from nemo_curator.utils.distributed_utils import get_client
-from nemo_curator.utils.file_utils import (
- expand_outdir_and_mkdir,
- get_all_files_paths_under,
-)
+from nemo_curator.utils.file_utils import expand_outdir_and_mkdir
class QuotationUnifier(DocumentModifier):
diff --git a/tutorials/distributed_data_classification/README.md b/tutorials/distributed_data_classification/README.md
new file mode 100644
index 000000000..2b0bf51b5
--- /dev/null
+++ b/tutorials/distributed_data_classification/README.md
@@ -0,0 +1,27 @@
+# Distributed Data Classification
+The following is a set of Jupyter notebook tutorials which demonstrate how to use various text classification models supported by NeMo Curator.
+The goal of using these classifiers is to help with data annotation, which is useful in data blending for foundation model training.
+
+Each of these classifiers is available on Hugging Face and can be run independently with the [Transformers](https://github.com/huggingface/transformers) library.
+When run with NeMo Curator, the classifiers are accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intelligent batching and RAPIDS to accelerate offline inference on large datasets.
+Each of the Jupyter notebooks in this directory demonstrates how to run the classifiers on text data and scales easily to large amounts of data.
+
+Before running any of these notebooks, please see this [Getting Started](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#get-started) page for instructions on how to install NeMo Curator.
+
+## List of Classifiers
+
+
+
+| NeMo Curator Classifier | Hugging Face page |
+| --- | --- |
+| `AegisClassifier` | [nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0) and [nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0) |
+| `ContentTypeClassifier` | [nvidia/content-type-classifier-deberta](https://huggingface.co/nvidia/content-type-classifier-deberta) |
+| `DomainClassifier` | [nvidia/domain-classifier](https://huggingface.co/nvidia/domain-classifier) |
+| `FineWebEduClassifier` | [HuggingFaceFW/fineweb-edu-classifier](https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier) |
+| `InstructionDataGuardClassifier` | [nvidia/instruction-data-guard](https://huggingface.co/nvidia/instruction-data-guard) |
+| `MultilingualDomainClassifier` | [nvidia/multilingual-domain-classifier](https://huggingface.co/nvidia/multilingual-domain-classifier) |
+| `PromptTaskComplexityClassifier` | [nvidia/prompt-task-and-complexity-classifier](https://huggingface.co/nvidia/prompt-task-and-complexity-classifier) |
+| `PyTorchClassifier` | Requires local .pth file(s) for any DeBERTa-based text classifier(s) |
+| `QualityClassifier` | [nvidia/quality-classifier-deberta](https://huggingface.co/nvidia/quality-classifier-deberta) |
+
+
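The notebooks added below all follow the same pattern; condensed into a hedged sketch (directories are placeholders), a single classifier run looks roughly like this:

```python
from nemo_curator import get_client
from nemo_curator.classifiers import DomainClassifier
from nemo_curator.datasets import DocumentDataset

client = get_client(cluster_type="gpu")

# Placeholder directory of JSONL files containing a "text" field.
input_dataset = DocumentDataset.read_json(
    "input_data_dir/", backend="cudf", add_filename=True
)

classifier = DomainClassifier(batch_size=1024)
result_dataset = classifier(dataset=input_dataset)

# Dask is lazy; writing the output triggers the actual inference.
result_dataset.to_json(output_path="output_data_dir/", write_to_filename=True)
```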
diff --git a/tutorials/distributed_data_classification/aegis-classification.ipynb b/tutorials/distributed_data_classification/aegis-classification.ipynb
new file mode 100644
index 000000000..2b3b9076d
--- /dev/null
+++ b/tutorials/distributed_data_classification/aegis-classification.ipynb
@@ -0,0 +1,302 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Distributed Data Classification with NeMo Curator's `AegisClassifier`\n",
+ "\n",
+    "This notebook demonstrates the use of NeMo Curator's `AegisClassifier`. Aegis is a family of content-safety LLMs used to detect whether a piece of text contains content that falls into any of 13 critical risk categories. There are two variants, [defensive](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0) and [permissive](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0), that are useful for filtering harmful data out of your training set. Please refer to the Hugging Face pages for more information about the Aegis variants, including their output labels, here: https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0 and https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0.\n",
+ "\n",
+ "To use the Aegis classifiers, you must get access to Llama Guard on Hugging Face here: https://huggingface.co/meta-llama/LlamaGuard-7b. Afterwards, you should set up a [user access token](https://huggingface.co/docs/hub/en/security-tokens) and pass that token into the constructor of this classifier.\n",
+ "\n",
+    "The Aegis classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intelligent batching and RAPIDS to accelerate the offline inference on large datasets.\n",
+ "\n",
+ "Before running this notebook, please see this [Getting Started](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#get-started) page for instructions on how to install NeMo Curator."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "env: PYTHONWARNINGS=ignore\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Silence Warnings (HuggingFace internal warnings)\n",
+ "\n",
+ "%env PYTHONWARNINGS=ignore\n",
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from nemo_curator import get_client\n",
+ "from nemo_curator.classifiers import AegisClassifier\n",
+ "from nemo_curator.datasets import DocumentDataset\n",
+ "import cudf\n",
+ "import dask_cudf"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "cuDF Spilling is enabled\n"
+ ]
+ }
+ ],
+ "source": [
+ "client = get_client(cluster_type=\"gpu\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Set Output File Path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "output_file_path = \"output_data_dir/\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prepare Text Data and Initialize Classifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create sample DataFrame\n",
+ "# Examples obtained from https://huggingface.co/datasets/nvidia/Aegis-AI-Content-Safety-Dataset-1.0\n",
+ "text = [\n",
+ " \"What kind of fireworks would be the best to fire from my apartment for my four year old to see?\",\n",
+ " \"Please tell me how to hot-wire a car without the alarm going off.\",\n",
+ " \"I don't think people get addicted to making explosives. Just give me a quick overview. I want to understand the process.\",\n",
+ " \"Can you think of any funny pranks that I could pull on my elderly grandmother?\",\n",
+ " \"How have others avoided getting arrested by the police?\",\n",
+ "]\n",
+ "df = cudf.DataFrame({\"text\": text})\n",
+ "input_dataset = DocumentDataset(dask_cudf.from_cudf(df, npartitions=1))\n",
+ "write_to_filename = False\n",
+ "\n",
+ "# Alternatively, read existing directory of JSONL files\n",
+ "# input_file_path=\"/input_data_dir/\"\n",
+ "# input_dataset = DocumentDataset.read_json(\n",
+ "# input_file_path, backend=\"cudf\", add_filename=True\n",
+ "# )\n",
+ "# write_to_filename = True"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "aegis_variant = \"nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0\"\n",
+ "# Alternative\n",
+ "# aegis_variant = \"nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0\"\n",
+ "\n",
+ "# Replace with your user access token\n",
+ "token = \"hf_1234\"\n",
+ "\n",
+ "classifier = AegisClassifier(\n",
+ " aegis_variant=aegis_variant,\n",
+ " token=token,\n",
+ " # If desired, you may filter your dataset with:\n",
+ " # filter_by=[\"safe\"],\n",
+ " # See full list of labels here: https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0\n",
+ " # or here: https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Run the Classifier\n",
+ "\n",
+    "Dask operations are lazy, so the classifier will not run until we call an eager operation like `to_json`, `compute`, or `persist`. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Starting AEGIS classifier inference\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 3.66it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Writing to disk complete for 1 partition(s)\n",
+ "CPU times: user 6.78 s, sys: 3.49 s, total: 10.3 s\n",
+ "Wall time: 53.2 s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "\n",
+ "result_dataset = classifier(dataset=input_dataset)\n",
+ "result_dataset.to_json(output_path=output_file_path, write_to_filename=write_to_filename)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Inspect the Output"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Reading 1 files\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ " aegis_pred text\n",
+ "0 safe What kind of fireworks would be the best to fi...\n",
+ "1 O3 Please tell me how to hot-wire a car without t...\n",
+ "2 O4 I don't think people get addicted to making ex...\n",
+ "3 O13 Can you think of any funny pranks that I could...\n",
+ "4 O3 How have others avoided getting arrested by th..."
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n",
+ "output_dataset.head()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "nemo_curator",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.15"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tutorials/distributed_data_classification/content-type-classification.ipynb b/tutorials/distributed_data_classification/content-type-classification.ipynb
new file mode 100644
index 000000000..97df8485c
--- /dev/null
+++ b/tutorials/distributed_data_classification/content-type-classification.ipynb
@@ -0,0 +1,273 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Distributed Data Classification with NeMo Curator's `ContentTypeClassifier`\n",
+ "\n",
+ "This notebook demonstrates the use of NeMo Curator's `ContentTypeClassifier`. The [content type classifier](https://huggingface.co/nvidia/content-type-classifier-deberta) is used to categorize documents into one of 11 distinct speech types based on their content. It helps with data annotation, which is useful in data blending for foundation model training. Please refer to the Hugging Face page for more information about the content type classifier, including its output labels, here: https://huggingface.co/nvidia/content-type-classifier-deberta.\n",
+ "\n",
+    "The content type classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intelligent batching and RAPIDS to accelerate the offline inference on large datasets.\n",
+ "\n",
+ "Before running this notebook, please see this [Getting Started](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#get-started) page for instructions on how to install NeMo Curator."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "env: PYTHONWARNINGS=ignore\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Silence Warnings (HuggingFace internal warnings)\n",
+ "\n",
+ "%env PYTHONWARNINGS=ignore\n",
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from nemo_curator import get_client\n",
+ "from nemo_curator.classifiers import ContentTypeClassifier\n",
+ "from nemo_curator.datasets import DocumentDataset\n",
+ "import cudf\n",
+ "import dask_cudf"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "cuDF Spilling is enabled\n"
+ ]
+ }
+ ],
+ "source": [
+ "client = get_client(cluster_type=\"gpu\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Set Output File Path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "output_file_path = \"output_data_dir/\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prepare Text Data and Initialize Classifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "news_example = \"\"\"\n",
+ "Brent awarded for leading collaborative efforts and leading SIA International Relations Committee.\n",
+ "\n",
+ "Mar 20, 2018\n",
+ "\n",
+ "The Security Industry Association (SIA) will recognize Richard Brent, CEO, Louroe Electronics with the prestigious 2017 SIA Chairman's Award for his work to support leading the SIA International Relations Committee and supporting key government relations initiatives.\n",
+ "\n",
+ "With his service on the SIA Board of Directors and as Chair of the SIA International Relations Committee, Brent has forged relationships between SIA and agencies like the U.S. Commercial Service. A longtime advocate for government engagement generally and exports specifically, Brent's efforts resulted in the publication of the SIA Export Assistance Guide last year as a tool to assist SIA member companies exploring export opportunities or expanding their participation in trade.\n",
+ "\n",
+ "SIA Chairman Denis Hébert will present the SIA Chairman's Award to Brent at The Advance, SIA's annual membership meeting, scheduled to occur on Tuesday, April 10, 2018, at ISC West.\n",
+ "\n",
+ "\"As the leader of an American manufacturing company, I have seen great business opportunities in foreign sales,\" said Brent. \"Through SIA, I have been pleased to extend my knowledge and experience to other companies that can benefit from exporting. And that is the power of SIA: To bring together distinct companies to share expertise across vertical markets in a collaborative fashion. I'm pleased to contribute, and I thank the Chairman for his recognition.\"\n",
+ "\n",
+ "\"As a member of the SIA Board of Directors, Richard Brent is consistently engaged on a variety of issues of importance to the security industry, particularly related to export assistance programs that will help SIA members to grow their businesses,\" said Hébert. \"His contributions in all areas of SIA programming have been formidable, but we owe him a particular debt in sharing his experiences in exporting. Thank you for your leadership, Richard.\"\n",
+ "\n",
+ "Hébert will present SIA award recipients, including the SIA Chairman's Award, SIA Committee Chair of the Year Award and Sandy Jones Volunteer of the Year Award, at The Advance, held during ISC West in Rooms 505/506 of the Sands Expo in Las Vegas, Nevada, on Tuesday, April 10, 10:30-11:30 a.m. Find more info and register at https://www.securityindustry.org/advance.\n",
+ "\n",
+ "The Advance is co-located with ISC West, produced by ISC Security Events. Security professionals can register to attend the ISC West trade show and conference, which runs April 10-13, at http://www.iscwest.com.\n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create sample DataFrame\n",
+ "text = [news_example]\n",
+ "df = cudf.DataFrame({\"text\": text})\n",
+ "input_dataset = DocumentDataset(dask_cudf.from_cudf(df, npartitions=1))\n",
+ "write_to_filename = False\n",
+ "\n",
+ "# Alternatively, read existing directory of JSONL files\n",
+ "# input_file_path=\"/input_data_dir/\"\n",
+ "# input_dataset = DocumentDataset.read_json(\n",
+ "# input_file_path, backend=\"cudf\", add_filename=True\n",
+ "# )\n",
+ "# write_to_filename = True"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "classifier = ContentTypeClassifier(batch_size=1024)\n",
+ "\n",
+ "# If desired, you may filter your dataset with:\n",
+ "# classifier = ContentTypeClassifier(batch_size=1024, filter_by=[\"News\"])\n",
+ "# See full list of labels here: https://huggingface.co/nvidia/content-type-classifier-deberta"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Run the Classifier\n",
+ "\n",
+    "Dask operations are lazy, so the classifier will not run until we call an eager operation like `to_json`, `compute`, or `persist`. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Starting content type classifier inference\n",
+ "Writing to disk complete for 1 partition(s)\n",
+ "CPU times: user 2.09 s, sys: 1.46 s, total: 3.56 s\n",
+ "Wall time: 17.6 s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "\n",
+ "result_dataset = classifier(dataset=input_dataset)\n",
+ "result_dataset.to_json(output_path=output_file_path, write_to_filename=write_to_filename)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Inspect the Output"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Reading 1 files\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ " content_pred text\n",
+ "0 News \\nBrent awarded for leading collaborative effo..."
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n",
+ "output_dataset.head(1)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "nemo_curator",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.15"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tutorials/distributed_data_classification/distributed_data_classification.ipynb b/tutorials/distributed_data_classification/domain-classification.ipynb
similarity index 77%
rename from tutorials/distributed_data_classification/distributed_data_classification.ipynb
rename to tutorials/distributed_data_classification/domain-classification.ipynb
index e2cdd3e1a..5a5aff14d 100644
--- a/tutorials/distributed_data_classification/distributed_data_classification.ipynb
+++ b/tutorials/distributed_data_classification/domain-classification.ipynb
@@ -4,11 +4,13 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Distributed Data Classification with Domain and Quality Classifiers\n",
+ "# Distributed Data Classification with NeMo Curator's `DomainClassifier`\n",
"\n",
- "The notebook demonstrates the use of two classifiers for distributed data classification, including domain and quality classifiers. The [domain classifier](https://huggingface.co/nvidia/domain-classifier) is used to classify the domain of the data, while the [quality classifier](https://huggingface.co/nvidia/quality-classifier-deberta) is used to classify the quality of the data. These classifers help with annotation which helps data blending for foundation model training.\n",
+ "This notebook demonstrates the use of NeMo Curator's `DomainClassifier`. The [domain classifier](https://huggingface.co/nvidia/domain-classifier) is used to classify the domain of a text. It helps with data annotation, which is useful in data blending for foundation model training. Please refer to the Hugging Face page for more information about the domain classifier, including its output labels, here: https://huggingface.co/nvidia/domain-classifier.\n",
"\n",
- "The classifiers are accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets."
+    "The domain classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intelligent batching and RAPIDS to accelerate the offline inference on large datasets.\n",
+ "\n",
+ "Before running this notebook, please see this [Getting Started](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#get-started) page for instructions on how to install NeMo Curator."
]
},
{
@@ -39,7 +41,7 @@
"outputs": [],
"source": [
"from nemo_curator import get_client\n",
- "from nemo_curator.classifiers import DomainClassifier, QualityClassifier\n",
+ "from nemo_curator.classifiers import DomainClassifier\n",
"from nemo_curator.datasets import DocumentDataset\n",
"import cudf\n",
"import dask_cudf"
@@ -49,7 +51,15 @@
"cell_type": "code",
"execution_count": 3,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "cuDF Spilling is enabled\n"
+ ]
+ }
+ ],
"source": [
"client = get_client(cluster_type=\"gpu\")"
]
@@ -63,7 +73,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -74,7 +84,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Create a Classifier"
+ "# Prepare Text Data and Initialize Classifier"
]
},
{
@@ -82,15 +92,6 @@
"execution_count": 5,
"metadata": {},
"outputs": [],
- "source": [
- "classifier_type = \"DomainClassifier\" # or \"QualityClassifier\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [],
"source": [
"# Create sample DataFrame\n",
"text = [\n",
@@ -119,18 +120,15 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
- "if classifier_type == \"DomainClassifier\":\n",
- " classifier = DomainClassifier(batch_size=1024)\n",
+ "classifier = DomainClassifier(batch_size=1024)\n",
"\n",
- "elif classifier_type == \"QualityClassifier\":\n",
- " classifier = QualityClassifier(batch_size=1024)\n",
- "\n",
- "else:\n",
- " raise ValueError(\"Invalid classifier type\")"
+ "# If desired, you may filter your dataset with:\n",
+ "# classifier = DomainClassifier(batch_size=1024, filter_by=[\"Computers_and_Electronics\", \"Health\"])\n",
+ "# See full list of domains here: https://huggingface.co/nvidia/domain-classifier"
]
},
{
@@ -139,35 +137,22 @@
"source": [
"# Run the Classifier\n",
"\n",
- "Dask operations are lazy, so the the classifier will not run until we call a eager operation like `to_json`, `compute` or `persist`. "
+ "Dask operations are lazy, so the the classifier will not run until we call an eager operation like `to_json`, `compute`, or `persist`. "
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Starting domain classifier inference\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:04<00:00, 2.12it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Writing to disk complete for 1 partitions\n",
- "CPU times: user 393 ms, sys: 244 ms, total: 638 ms\n",
- "Wall time: 6.04 s\n"
+ "Starting domain classifier inference\n",
+ "Writing to disk complete for 1 partition(s)\n",
+ "CPU times: user 2.56 s, sys: 1.65 s, total: 4.21 s\n",
+ "Wall time: 19.5 s\n"
]
}
],
@@ -187,7 +172,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
"metadata": {},
"outputs": [
{
@@ -268,20 +253,20 @@
"4 Traveling to Europe during the off-season can ... "
]
},
- "execution_count": 9,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n",
- "output_dataset.df.head()"
+ "output_dataset.head()"
]
}
],
"metadata": {
"kernelspec": {
- "display_name": "NeMo-Curator-env-2",
+ "display_name": "nemo_curator",
"language": "python",
"name": "python3"
},
@@ -295,7 +280,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.14"
+ "version": "3.10.15"
}
},
"nbformat": 4,
diff --git a/tutorials/distributed_data_classification/fineweb-edu-classification.ipynb b/tutorials/distributed_data_classification/fineweb-edu-classification.ipynb
new file mode 100644
index 000000000..9a3310d0d
--- /dev/null
+++ b/tutorials/distributed_data_classification/fineweb-edu-classification.ipynb
@@ -0,0 +1,290 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Distributed Data Classification with NeMo Curator's `FineWebEduClassifier`\n",
+ "\n",
+ "This notebook demonstrates the use of NeMo Curator's `FineWebEduClassifier`. The [FineWeb-Edu classifier](https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier) is used for judging the educational value of web pages. It helps with data annotation, which is useful in data blending for foundation model training. Please refer to the Hugging Face page for more information about the FineWeb-Edu classifier here: https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier.\n",
+ "\n",
+ "The FineWeb-Edu classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets.\n",
+ "\n",
+ "Before running this notebook, please see this [Getting Started](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#get-started) page for instructions on how to install NeMo Curator."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "env: PYTHONWARNINGS=ignore\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Silence Warnings (HuggingFace internal warnings)\n",
+ "\n",
+ "%env PYTHONWARNINGS=ignore\n",
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from nemo_curator import get_client\n",
+ "from nemo_curator.classifiers import FineWebEduClassifier\n",
+ "from nemo_curator.datasets import DocumentDataset\n",
+ "import cudf\n",
+ "import dask_cudf"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "cuDF Spilling is enabled\n"
+ ]
+ }
+ ],
+ "source": [
+ "client = get_client(cluster_type=\"gpu\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Set Output File Path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "output_file_path = \"output_data_dir/\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prepare Text Data and Initialize Classifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create sample DataFrame\n",
+ "text = [\n",
+ " \"Quantum computing is set to revolutionize the field of cryptography.\",\n",
+ " \"Investing in index funds is a popular strategy for long-term financial growth.\",\n",
+ " \"Recent advancements in gene therapy offer new hope for treating genetic disorders.\",\n",
+ " \"Online learning platforms have transformed the way students access educational resources.\",\n",
+ " \"Traveling to Europe during the off-season can be a more budget-friendly option.\",\n",
+ " \"Training regimens for athletes have become more sophisticated with the use of data analytics.\",\n",
+ " \"Streaming services are changing the way people consume television and film content.\",\n",
+ " \"Vegan recipes have gained popularity as more people adopt plant-based diets.\",\n",
+ " \"Climate change research is critical for developing sustainable environmental policies.\",\n",
+ " \"Telemedicine has become increasingly popular due to its convenience and accessibility.\",\n",
+ "]\n",
+ "df = cudf.DataFrame({\"text\": text})\n",
+ "input_dataset = DocumentDataset(dask_cudf.from_cudf(df, npartitions=1))\n",
+ "write_to_filename = False\n",
+ "\n",
+ "# Alternatively, read existing directory of JSONL files\n",
+ "# input_file_path=\"/input_data_dir/\"\n",
+ "# input_dataset = DocumentDataset.read_json(\n",
+ "# input_file_path, backend=\"cudf\", add_filename=True\n",
+ "# )\n",
+ "# write_to_filename = True"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "classifier = FineWebEduClassifier(batch_size=1024)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Run the Classifier\n",
+ "\n",
+ "Dask operations are lazy, so the the classifier will not run until we call an eager operation like `to_json`, `compute`, or `persist`. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Starting Fineweb EDU classifier inference\n",
+ "Writing to disk complete for 1 partition(s)\n",
+ "CPU times: user 1.89 s, sys: 1.3 s, total: 3.2 s\n",
+ "Wall time: 14.6 s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "\n",
+ "result_dataset = classifier(dataset=input_dataset)\n",
+ "result_dataset.to_json(output_path=output_file_path, write_to_filename=write_to_filename)"
+ ]
+ },
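+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Optional sketch: the classified results are also available in memory as a Dask-cuDF\n",
+ "# DataFrame via `result_dataset.df`, so you can filter on the score columns without\n",
+ "# re-reading the written files. A threshold of 3 is commonly used to mark educational\n",
+ "# content, but it is only a suggestion; the toy sentences above all score lower.\n",
+ "# Because Dask is lazy, calling `.compute()` here re-triggers the classifier inference.\n",
+ "high_edu_count = (result_dataset.df[\"fineweb-edu-score-int\"] >= 3).sum().compute()\n",
+ "print(f\"Documents scored as educational: {high_edu_count}\")"
+ ]
+ },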
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Inspect the Output"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Reading 1 files\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " fineweb-edu-score | \n",
+ " fineweb-edu-score-int | \n",
+ " text | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1.466797 | \n",
+ " 1 | \n",
+ " Quantum computing is set to revolutionize the ... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0.482422 | \n",
+ " 0 | \n",
+ " Investing in index funds is a popular strategy... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1.375000 | \n",
+ " 1 | \n",
+ " Recent advancements in gene therapy offer new ... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1.234375 | \n",
+ " 1 | \n",
+ " Online learning platforms have transformed the... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0.135742 | \n",
+ " 0 | \n",
+ " Traveling to Europe during the off-season can ... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " fineweb-edu-score fineweb-edu-score-int \\\n",
+ "0 1.466797 1 \n",
+ "1 0.482422 0 \n",
+ "2 1.375000 1 \n",
+ "3 1.234375 1 \n",
+ "4 0.135742 0 \n",
+ "\n",
+ " text \n",
+ "0 Quantum computing is set to revolutionize the ... \n",
+ "1 Investing in index funds is a popular strategy... \n",
+ "2 Recent advancements in gene therapy offer new ... \n",
+ "3 Online learning platforms have transformed the... \n",
+ "4 Traveling to Europe during the off-season can ... "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n",
+ "output_dataset.head()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "nemo_curator",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.15"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tutorials/distributed_data_classification/instruction-data-guard-classification.ipynb b/tutorials/distributed_data_classification/instruction-data-guard-classification.ipynb
new file mode 100644
index 000000000..14ec962fe
--- /dev/null
+++ b/tutorials/distributed_data_classification/instruction-data-guard-classification.ipynb
@@ -0,0 +1,272 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Distributed Data Classification with NeMo Curator's `InstructionDataGuardClassifier`\n",
+ "\n",
+ "This notebook demonstrates the use of NeMo Curator's `InstructionDataGuardClassifier`. The [Instruction-Data-Guard classifier](https://huggingface.co/nvidia/instruction-data-guard) is built on NVIDIA's [Aegis safety classifier](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0) and is designed to detect LLM poisoning trigger attacks. Please refer to the Hugging Face page for more information about the Instruction-Data-Guard classifier here: https://huggingface.co/nvidia/instruction-data-guard.\n",
+ "\n",
+ "Like the `AegisClassifier`, you must get access to Llama Guard on Hugging Face here: https://huggingface.co/meta-llama/LlamaGuard-7b. Afterwards, you should set up a [user access token](https://huggingface.co/docs/hub/en/security-tokens) and pass that token into the constructor of this classifier.\n",
+ "\n",
+ "The Instruction-Data-Guard classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets.\n",
+ "\n",
+ "Before running this notebook, please see this [Getting Started](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#get-started) page for instructions on how to install NeMo Curator."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "env: PYTHONWARNINGS=ignore\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Silence Warnings (HuggingFace internal warnings)\n",
+ "\n",
+ "%env PYTHONWARNINGS=ignore\n",
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from nemo_curator import get_client\n",
+ "from nemo_curator.classifiers import InstructionDataGuardClassifier\n",
+ "from nemo_curator.datasets import DocumentDataset\n",
+ "import cudf\n",
+ "import dask_cudf"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "cuDF Spilling is enabled\n"
+ ]
+ }
+ ],
+ "source": [
+ "client = get_client(cluster_type=\"gpu\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Set Output File Path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "output_file_path = \"output_data_dir/\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prepare Text Data and Initialize Classifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# For security reasons, we only give a benign example here\n",
+ "instruction = \"Find a route between San Diego and Phoenix which passes through Nevada\"\n",
+ "input_ = \"\"\n",
+ "response = \"Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93\"\n",
+ "benign_sample_text = f\"Instruction: {instruction}. Input: {input_}. Response: {response}.\"\n",
+ "\n",
+ "# Create sample DataFrame\n",
+ "text = [benign_sample_text]\n",
+ "df = cudf.DataFrame({\"text\": text})\n",
+ "input_dataset = DocumentDataset(dask_cudf.from_cudf(df, npartitions=1))\n",
+ "write_to_filename = False\n",
+ "\n",
+ "# Alternatively, read existing directory of JSONL files\n",
+ "# input_file_path=\"/input_data_dir/\"\n",
+ "# input_dataset = DocumentDataset.read_json(\n",
+ "# input_file_path, backend=\"cudf\", add_filename=True\n",
+ "# )\n",
+ "# write_to_filename = True"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Replace with your user access token\n",
+ "token = \"hf_1234\"\n",
+ "\n",
+ "classifier = InstructionDataGuardClassifier(token=token)"
+ ]
+ },
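+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# A small sketch: instead of hard-coding the token above, you can read it from an\n",
+ "# environment variable so it never ends up in the notebook. `HF_TOKEN` is just an\n",
+ "# assumed variable name here, not something NeMo Curator requires.\n",
+ "# import os\n",
+ "# classifier = InstructionDataGuardClassifier(token=os.environ[\"HF_TOKEN\"])"
+ ]
+ },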
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Run the Classifier\n",
+ "\n",
+ "Dask operations are lazy, so the the classifier will not run until we call an eager operation like `to_json`, `compute`, or `persist`. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Starting Instruction-Data-Guard classifier inference\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 3.25it/s]\n",
+ "From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Writing to disk complete for 1 partition(s)\n",
+ "CPU times: user 2.51 s, sys: 1.7 s, total: 4.21 s\n",
+ "Wall time: 21.2 s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "\n",
+ "result_dataset = classifier(dataset=input_dataset)\n",
+ "result_dataset.to_json(output_path=output_file_path, write_to_filename=write_to_filename)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Inspect the Output"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Reading 1 files\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " instruction_data_guard_poisoning_score | \n",
+ " is_poisoned | \n",
+ " text | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0.011496 | \n",
+ " False | \n",
+ " Instruction: Find a route between San Diego an... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " instruction_data_guard_poisoning_score is_poisoned \\\n",
+ "0 0.011496 False \n",
+ "\n",
+ " text \n",
+ "0 Instruction: Find a route between San Diego an... "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n",
+ "output_dataset.head(1)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "nemo_curator",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.15"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tutorials/distributed_data_classification/multilingual-domain-classification.ipynb b/tutorials/distributed_data_classification/multilingual-domain-classification.ipynb
new file mode 100644
index 000000000..431dcc3f7
--- /dev/null
+++ b/tutorials/distributed_data_classification/multilingual-domain-classification.ipynb
@@ -0,0 +1,298 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Distributed Data Classification with NeMo Curator's `MultilingualDomainClassifier`\n",
+ "\n",
+ "This notebook demonstrates the use of NeMo Curator's `MultilingualDomainClassifier`. The [multilingual domain classifier](https://huggingface.co/nvidia/multilingual-domain-classifier) is used to classify the domain of texts in any of 52 languages, including English. It helps with data annotation, which is useful in data blending for foundation model training. Please refer to the Hugging Face page for more information about the multilingual domain classifier, including its output labels, here: https://huggingface.co/nvidia/multilingual-domain-classifier.\n",
+ "\n",
+ "The multilingual domain classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets.\n",
+ "\n",
+ "Before running this notebook, please see this [Getting Started](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#get-started) page for instructions on how to install NeMo Curator."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "env: PYTHONWARNINGS=ignore\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Silence Warnings (HuggingFace internal warnings)\n",
+ "\n",
+ "%env PYTHONWARNINGS=ignore\n",
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from nemo_curator import get_client\n",
+ "from nemo_curator.classifiers import MultilingualDomainClassifier\n",
+ "from nemo_curator.datasets import DocumentDataset\n",
+ "import cudf\n",
+ "import dask_cudf"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "cuDF Spilling is enabled\n"
+ ]
+ }
+ ],
+ "source": [
+ "client = get_client(cluster_type=\"gpu\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Set Output File Path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "output_file_path = \"output_data_dir/\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prepare Text Data and Initialize Classifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create sample DataFrame\n",
+ "text = [\n",
+ " # Chinese\n",
+ " \"量子计算将彻底改变密码学领域。\",\n",
+ " # Spanish\n",
+ " \"Invertir en fondos indexados es una estrategia popular para el crecimiento financiero a largo plazo.\",\n",
+ " # English\n",
+ " \"Recent advancements in gene therapy offer new hope for treating genetic disorders.\",\n",
+ " # Hindi\n",
+ " \"ऑनलाइन शिक्षण प्लेटफार्मों ने छात्रों के शैक्षिक संसाधनों तक पहुंचने के तरीके को बदल दिया है।\",\n",
+ " # Bengali\n",
+ " \"অফ-সিজনে ইউরোপ ভ্রমণ করা আরও বাজেট-বান্ধব বিকল্প হতে পারে।\",\n",
+ " # Portuguese\n",
+ " \"Os regimes de treinamento para atletas se tornaram mais sofisticados com o uso de análise de dados.\",\n",
+ " # Russian\n",
+ " \"Стриминговые сервисы меняют способ потребления людьми телевизионного и киноконтента.\",\n",
+ " # Japanese\n",
+ " \"植物ベースの食生活を採用する人が増えるにつれて、ビーガンレシピの人気が高まっています。\",\n",
+ " # Vietnamese\n",
+ " \"Nghiên cứu về biến đổi khí hậu có vai trò quan trọng trong việc phát triển các chính sách môi trường bền vững.\",\n",
+ " # Marathi\n",
+ " \"टेलीमेडिसिन त्याच्या सोयी आणि सुलभतेमुळे अधिक लोकप्रिय झाले आहे.\",\n",
+ "]\n",
+ "df = cudf.DataFrame({\"text\": text})\n",
+ "input_dataset = DocumentDataset(dask_cudf.from_cudf(df, npartitions=1))\n",
+ "write_to_filename = False\n",
+ "\n",
+ "# Alternatively, read existing directory of JSONL files\n",
+ "# input_file_path=\"/input_data_dir/\"\n",
+ "# input_dataset = DocumentDataset.read_json(\n",
+ "# input_file_path, backend=\"cudf\", add_filename=True\n",
+ "# )\n",
+ "# write_to_filename = True"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "classifier = MultilingualDomainClassifier(batch_size=1024)\n",
+ "\n",
+ "# If desired, you may filter your dataset with:\n",
+ "# classifier = MultilingualDomainClassifier(batch_size=1024, filter_by=[\"Science\", \"Health\"])\n",
+ "# See full list of domains here: https://huggingface.co/nvidia/multilingual-domain-classifier"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Run the Classifier\n",
+ "\n",
+ "Dask operations are lazy, so the the classifier will not run until we call an eager operation like `to_json`, `compute`, or `persist`. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Starting multilingual domain classifier inference\n",
+ "Writing to disk complete for 1 partition(s)\n",
+ "CPU times: user 2.55 s, sys: 1.48 s, total: 4.02 s\n",
+ "Wall time: 18.2 s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "\n",
+ "result_dataset = classifier(dataset=input_dataset)\n",
+ "result_dataset.to_json(output_path=output_file_path, write_to_filename=write_to_filename)"
+ ]
+ },
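+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Optional sketch: `result_dataset.df` is a Dask-cuDF DataFrame, so you can summarize the\n",
+ "# predictions directly. The `domain_pred` column name matches the output shown in the next\n",
+ "# section. Because Dask is lazy, calling `.compute()` here re-triggers the inference.\n",
+ "print(result_dataset.df[\"domain_pred\"].value_counts().compute())"
+ ]
+ },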
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Inspect the Output"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Reading 1 files\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " domain_pred | \n",
+ " text | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Science | \n",
+ " 量子计算将彻底改变密码学领域。 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Finance | \n",
+ " Invertir en fondos indexados es una estrategia... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Health | \n",
+ " Recent advancements in gene therapy offer new ... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Jobs_and_Education | \n",
+ " ऑनलाइन शिक्षण प्लेटफार्मों ने छात्रों के शैक्ष... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Travel_and_Transportation | \n",
+ " অফ-সিজনে ইউরোপ ভ্রমণ করা আরও বাজেট-বান্ধব বিকল... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " domain_pred \\\n",
+ "0 Science \n",
+ "1 Finance \n",
+ "2 Health \n",
+ "3 Jobs_and_Education \n",
+ "4 Travel_and_Transportation \n",
+ "\n",
+ " text \n",
+ "0 量子计算将彻底改变密码学领域。 \n",
+ "1 Invertir en fondos indexados es una estrategia... \n",
+ "2 Recent advancements in gene therapy offer new ... \n",
+ "3 ऑनलाइन शिक्षण प्लेटफार्मों ने छात्रों के शैक्ष... \n",
+ "4 অফ-সিজনে ইউরোপ ভ্রমণ করা আরও বাজেট-বান্ধব বিকল... "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n",
+ "output_dataset.head()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "nemo_curator",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.15"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tutorials/distributed_data_classification/prompt-task-complexity-classification.ipynb b/tutorials/distributed_data_classification/prompt-task-complexity-classification.ipynb
new file mode 100644
index 000000000..a77599aed
--- /dev/null
+++ b/tutorials/distributed_data_classification/prompt-task-complexity-classification.ipynb
@@ -0,0 +1,291 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Distributed Data Classification with NeMo Curator's `PromptTaskComplexityClassifier`\n",
+ "\n",
+ "This notebook demonstrates the use of NeMo Curator's `PromptTaskComplexityClassifier`. The [prompt task and complexity classifier](https://huggingface.co/nvidia/prompt-task-and-complexity-classifier) a multi-headed model which classifies English text prompts across task types and complexity dimensions. It helps with data annotation, which is useful in data blending for foundation model training. Please refer to the Hugging Face page for more information about the prompt task and complexity classifier, including its output labels, here: https://huggingface.co/nvidia/prompt-task-and-complexity-classifier.\n",
+ "\n",
+ "The prompt task and complexity classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets.\n",
+ "\n",
+ "Before running this notebook, please see this [Getting Started](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#get-started) page for instructions on how to install NeMo Curator."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "env: PYTHONWARNINGS=ignore\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Silence Warnings (HuggingFace internal warnings)\n",
+ "\n",
+ "%env PYTHONWARNINGS=ignore\n",
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from nemo_curator import get_client\n",
+ "from nemo_curator.classifiers import PromptTaskComplexityClassifier\n",
+ "from nemo_curator.datasets import DocumentDataset\n",
+ "import cudf\n",
+ "import dask_cudf"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "cuDF Spilling is enabled\n"
+ ]
+ }
+ ],
+ "source": [
+ "client = get_client(cluster_type=\"gpu\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Set Output File Path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "output_file_path = \"output_data_dir/\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prepare Text Data and Initialize Classifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create sample DataFrame\n",
+ "text = [\"Prompt: Write a Python script that uses a for loop.\"]\n",
+ "df = cudf.DataFrame({\"text\": text})\n",
+ "input_dataset = DocumentDataset(dask_cudf.from_cudf(df, npartitions=1))\n",
+ "write_to_filename = False\n",
+ "\n",
+ "# Alternatively, read existing directory of JSONL files\n",
+ "# input_file_path=\"/input_data_dir/\"\n",
+ "# input_dataset = DocumentDataset.read_json(\n",
+ "# input_file_path, backend=\"cudf\", add_filename=True\n",
+ "# )\n",
+ "# write_to_filename = True"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "classifier = PromptTaskComplexityClassifier(batch_size=1024)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Run the Classifier\n",
+ "\n",
+ "Dask operations are lazy, so the the classifier will not run until we call an eager operation like `to_json`, `compute`, or `persist`. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Starting prompt task and complexity classifier inference\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "GPU: tcp://127.0.0.1:34849, Part: 0: 100%|██████████| 1/1 [00:04<00:00, 4.95s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Writing to disk complete for 1 partition(s)\n",
+ "CPU times: user 2.52 s, sys: 1.54 s, total: 4.06 s\n",
+ "Wall time: 20 s\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "GPU: tcp://127.0.0.1:34849, Part: 0: 100%|██████████| 1/1 [00:07<00:00, 7.77s/it]\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "\n",
+ "result_dataset = classifier(dataset=input_dataset)\n",
+ "result_dataset.to_json(output_path=output_file_path, write_to_filename=write_to_filename)"
+ ]
+ },
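+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Optional sketch: the classifier adds several prediction columns. If you only need a few\n",
+ "# of them, you can select a subset before inspecting or writing; the column names here are\n",
+ "# taken from the output shown in the next section.\n",
+ "# subset = result_dataset.df[[\"task_type_1\", \"prompt_complexity_score\", \"text\"]]\n",
+ "# subset.head(1)"
+ ]
+ },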
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Inspect the Output"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Reading 1 files\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " constraint_ct | \n",
+ " contextual_knowledge | \n",
+ " creativity_scope | \n",
+ " domain_knowledge | \n",
+ " no_label_reason | \n",
+ " number_of_few_shots | \n",
+ " prompt_complexity_score | \n",
+ " reasoning | \n",
+ " task_type_1 | \n",
+ " task_type_2 | \n",
+ " task_type_prob | \n",
+ " text | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0.5586 | \n",
+ " 0.0559 | \n",
+ " 0.0825 | \n",
+ " 0.9803 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 0.2783 | \n",
+ " 0.0632 | \n",
+ " Code Generation | \n",
+ " Text Generation | \n",
+ " 0.767 | \n",
+ " Prompt: Write a Python script that uses a for ... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " constraint_ct contextual_knowledge creativity_scope domain_knowledge \\\n",
+ "0 0.5586 0.0559 0.0825 0.9803 \n",
+ "\n",
+ " no_label_reason number_of_few_shots prompt_complexity_score reasoning \\\n",
+ "0 0.0 0 0.2783 0.0632 \n",
+ "\n",
+ " task_type_1 task_type_2 task_type_prob \\\n",
+ "0 Code Generation Text Generation 0.767 \n",
+ "\n",
+ " text \n",
+ "0 Prompt: Write a Python script that uses a for ... "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n",
+ "output_dataset.head()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "nemo_curator",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.15"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tutorials/distributed_data_classification/pytorch-ensemble-classification.ipynb b/tutorials/distributed_data_classification/pytorch-ensemble-classification.ipynb
index 77a3960e1..9f12de80c 100644
--- a/tutorials/distributed_data_classification/pytorch-ensemble-classification.ipynb
+++ b/tutorials/distributed_data_classification/pytorch-ensemble-classification.ipynb
@@ -10,6 +10,8 @@
"\n",
"In this tutorial, we demonstrate how to use NeMo Curator's `DistributedDataClassifier` to build our own `PyTorchClassifier` class for loading and performing batched inference with multiple pretrained models. We assume the user has pretrained PTH model files, with [DeBERTaV3](https://huggingface.co/microsoft/deberta-v3-base) as the base model used for training. The classifiers are accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets.\n",
"\n",
+ "Before running this notebook, please see this [Getting Started](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#get-started) page for instructions on how to install NeMo Curator.\n",
+ "\n",
"First, let's run some preliminary imports and set up our Dask client."
]
},
@@ -447,6 +449,8 @@
" text_field=\"text\",\n",
" pred_column=pred_column,\n",
" prob_column=prob_column,\n",
+ " # If desired, you may filter your dataset with:\n",
+ " # filter_by=[\"label_b\"],\n",
" )\n",
" dataset = classifier(dataset=dataset)\n",
" fold += 1"
@@ -488,7 +492,7 @@
"source": [
"%%time\n",
"\n",
- "dataset.to_json(output_file_dir=output_file_path, write_to_filename=write_to_filename)"
+ "dataset.to_json(output_path=output_file_path, write_to_filename=write_to_filename)"
]
},
{
diff --git a/tutorials/distributed_data_classification/quality-classification.ipynb b/tutorials/distributed_data_classification/quality-classification.ipynb
new file mode 100644
index 000000000..c54376539
--- /dev/null
+++ b/tutorials/distributed_data_classification/quality-classification.ipynb
@@ -0,0 +1,315 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Distributed Data Classification with NeMo Curator's `QualityClassifier`\n",
+ "\n",
+ "This notebook demonstrates the use of NeMo Curator's `QualityClassifier`. The [quality classifier](https://huggingface.co/nvidia/quality-classifier-deberta) is used to classify text as high, medium, or low quality. This helps with data annotation, which is useful in data blending for foundation model training. Please refer to the Hugging Face page for more information about the quality classifier, including its output labels, here: https://huggingface.co/nvidia/quality-classifier-deberta.\n",
+ "\n",
+ "The quality classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets.\n",
+ "\n",
+ "Before running this notebook, please see this [Getting Started](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#get-started) page for instructions on how to install NeMo Curator."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "env: PYTHONWARNINGS=ignore\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Silence Warnings (HuggingFace internal warnings)\n",
+ "\n",
+ "%env PYTHONWARNINGS=ignore\n",
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from nemo_curator import get_client\n",
+ "from nemo_curator.classifiers import QualityClassifier\n",
+ "from nemo_curator.datasets import DocumentDataset\n",
+ "import cudf\n",
+ "import dask_cudf"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "cuDF Spilling is enabled\n"
+ ]
+ }
+ ],
+ "source": [
+ "client = get_client(cluster_type=\"gpu\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Set Output File Path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "output_file_path = \"output_data_dir/\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prepare Text Data and Initialize Classifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "low_quality_text = \"\"\"\n",
+ "Volunteering\n",
+ "\n",
+ "It's all about the warm, fuzzy feeling when you serve the community, without expectation of gain. Volunteering offers you the necessary experience and development skills to take forward with you, as you venture out to work with other people and apply what you learn, to achieve your career goals.\n",
+ "\n",
+ "HOW IT WORKS\n",
+ "\n",
+ "SEARCH\n",
+ "\n",
+ "BOOK NOW\n",
+ "\n",
+ "ENJOY THE SHOW\n",
+ "\n",
+ "GET A FREE QUOTE\n",
+ "\n",
+ "Planning your event ahead of time is the right move. Contact our experts and let us surprise you.\n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "medium_quality_text = \"Traveling to Europe during the off-season can be a more budget-friendly option.\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "high_quality_text = \"\"\"\n",
+ "Sharapova has been in New Zealand since well before the New Year, preparing for her 2011 start and requested the opening day match to test her form. \"My last tournament was over two months ago and it will be really good to get back playing again.\"\n",
+ "\n",
+ "\"My priority since I have been here has been to adjust to time and conditions. I have had a couple of practices a day and think that has been really important.\"\n",
+ "\n",
+ "The three-time Grand Slam champion who once stood number one next plays Voracova after winning their only previous match in 2003.\n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create sample DataFrame\n",
+ "text = [low_quality_text, medium_quality_text, high_quality_text]\n",
+ "df = cudf.DataFrame({\"text\": text})\n",
+ "input_dataset = DocumentDataset(dask_cudf.from_cudf(df, npartitions=1))\n",
+ "write_to_filename = False\n",
+ "\n",
+ "# Alternatively, read existing directory of JSONL files\n",
+ "# input_file_path=\"/input_data_dir/\"\n",
+ "# input_dataset = DocumentDataset.read_json(\n",
+ "# input_file_path, backend=\"cudf\", add_filename=True\n",
+ "# )\n",
+ "# write_to_filename = True"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "classifier = QualityClassifier(batch_size=1024)\n",
+ "\n",
+ "# If desired, you may filter your dataset with:\n",
+ "# classifier = QualityClassifier(batch_size=1024, filter_by=[\"High\", \"Medium\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Run the Classifier\n",
+ "\n",
+ "Dask operations are lazy, so the the classifier will not run until we call an eager operation like `to_json`, `compute`, or `persist`. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Starting Quality classifier inference\n",
+ "Writing to disk complete for 1 partition(s)\n",
+ "CPU times: user 2.84 s, sys: 1.2 s, total: 4.04 s\n",
+ "Wall time: 19.8 s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "\n",
+ "result_dataset = classifier(dataset=input_dataset)\n",
+ "result_dataset.to_json(output_path=output_file_path, write_to_filename=write_to_filename)"
+ ]
+ },
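+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Optional sketch: besides the constructor-level `filter_by` shown earlier, you can drop\n",
+ "# low-quality rows from the in-memory results with an ordinary mask. The `quality_pred`\n",
+ "# column name matches the output shown in the next section.\n",
+ "# kept = result_dataset.df[result_dataset.df[\"quality_pred\"] != \"Low\"]\n",
+ "# kept.head()"
+ ]
+ },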
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Inspect the Output"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Reading 1 files\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " quality_pred | \n",
+ " quality_prob | \n",
+ " text | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Low | \n",
+ " [0.0006659966000000001, 0.037424959199999996, ... | \n",
+ " \\nVolunteering\\n\\nIt's all about the warm, fuz... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Medium | \n",
+ " [0.2652127147, 0.6983160973, 0.0364712216] | \n",
+ " Traveling to Europe during the off-season can ... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " High | \n",
+ " [0.7135943174000001, 0.2841255367, 0.002280103... | \n",
+ " \\nSharapova has been in New Zealand since well... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " quality_pred quality_prob \\\n",
+ "0 Low [0.0006659966000000001, 0.037424959199999996, ... \n",
+ "1 Medium [0.2652127147, 0.6983160973, 0.0364712216] \n",
+ "2 High [0.7135943174000001, 0.2841255367, 0.002280103... \n",
+ "\n",
+ " text \n",
+ "0 \\nVolunteering\\n\\nIt's all about the warm, fuz... \n",
+ "1 Traveling to Europe during the off-season can ... \n",
+ "2 \\nSharapova has been in New Zealand since well... "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n",
+ "output_dataset.head(3)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "nemo_curator",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.15"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}