Merge in main

Signed-off-by: Vibhu Jawa <[email protected]>
NVIDIA · Feb 5, 2025 · 3cb6354 · 3cb6354
2 parents ac7a7fc + c3fb61d
commit 3cb6354
Show file tree

Hide file tree

Showing 37 changed files with 2,248 additions and 325 deletions.
diff --git a/.github/workflows/build-test-publish-wheel.yml b/.github/workflows/build-test-publish-wheel.yml
@@ -27,20 +27,12 @@ defaults:
 
 jobs:
   build-test-publish-wheel:
-    uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_build_test_publish_wheel.yml@v0.7.0
+    uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_build_test_publish_wheel.yml@v0.20.0
     with:
-      image-name: nemo_curator_container
-      dockerfile: Dockerfile
-      image-label: nemo-curator
-      build-args: |
-        IMAGE_LABEL=nemo-curator
-        REPO_URL=https://github.com/${{ github.repository }}.git
-        CURATOR_COMMIT=${{ github.sha }}
-      prune-filter-timerange: 24h
       dry-run: true
       python-package: nemo_curator
-      container-workdir: /opt/NeMo-Curator/
       environment: public
+      python-version: '3.10'
     secrets:
       TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
       TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -31,19 +31,11 @@ on:
         description: Branch to target for version bump
 jobs:
   release:
-    uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.18.4
+    uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.20.1
     with:
       release-ref: ${{ inputs.release-ref }}
-      image-name: nemo_curator_container
-      dockerfile: Dockerfile
-      image-label: nemo-curator
-      build-args: |
-        IMAGE_LABEL=nemo-curator
-        REPO_URL=https://github.com/${{ github.repository }}.git
-        CURATOR_COMMIT=${{ inputs.release-ref }}
-      prune-filter-timerange: 24h
       python-package: nemo_curator
-      container-workdir: /opt/NeMo-Curator
+      python-version: '3.10'
       library-name: NeMo Curator
       dry-run: ${{ inputs.dry-run }}
       version-bump-branch: ${{ inputs.version-bump-branch }}

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -19,7 +19,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest]
-        python-version: ["3.10"]
+        python-version: ["3.10", "3.12"]
     steps:
       - uses: actions/checkout@v4
       - name: Optionally free up space on Ubuntu

diff --git a/Dockerfile b/Dockerfile
@@ -1,8 +1,8 @@
 # See https://github.com/rapidsai/ci-imgs for ARG options
-# NeMo Curator requires Python 3.10, Ubuntu 22.04/20.04, and CUDA 12 (or above)
+# NeMo Curator requires Python 3.12, Ubuntu 22.04/20.04, and CUDA 12 (or above)
 ARG CUDA_VER=12.5.1
 ARG LINUX_VER=ubuntu22.04
-ARG PYTHON_VER=3.10
+ARG PYTHON_VER=3.12
 ARG IMAGE_LABEL
 ARG REPO_URL
 ARG CURATOR_COMMIT
@@ -33,7 +33,7 @@ ARG CUDA_VER
 
 # Install the minimal libcu* libraries needed by NeMo Curator
 RUN conda create -y --name curator -c nvidia/label/cuda-${CUDA_VER} -c conda-forge \
-  python=3.10 \
+  python=3.12 \
   cuda-cudart \
   libcufft \
   libcublas \

diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1 @@
+include LICENSE
diff --git a/README.md b/README.md
@@ -69,7 +69,7 @@ This section explains how to install NeMo Curator and use the Python library, Py
 
 Before installing NeMo Curator, ensure that the following requirements are met:
 
-- Python 3.10
+- Python 3.10 or higher
 - Ubuntu 22.04/20.04
 - NVIDIA GPU (optional)
   - Volta™ or higher ([compute capability 7.0+](https://developer.nvidia.com/cuda-gpus))
@@ -158,7 +158,7 @@ To get started with NeMo Curator, you can follow the tutorials [available here](
 
 - [`tinystories`](https://github.com/NVIDIA/NeMo-Curator/tree/main/tutorials/tinystories) which focuses on data curation for training LLMs from scratch.
 - [`peft-curation`](https://github.com/NVIDIA/NeMo-Curator/tree/main/tutorials/peft-curation) which focuses on data curation for LLM parameter-efficient fine-tuning (PEFT) use-cases.
-- [`distributed_data_classification`](https://github.com/NVIDIA/NeMo-Curator/tree/main/tutorials/distributed_data_classification) which focuses on using the domain and quality classifiers to help with data annotation.
+- [`distributed_data_classification`](https://github.com/NVIDIA/NeMo-Curator/tree/main/tutorials/distributed_data_classification) which demonstrates how to use NVIDIA's Hugging Face classifiers to help with data annotation.
 - [`single_node_tutorial`](https://github.com/NVIDIA/NeMo-Curator/tree/main/tutorials/single_node_tutorial) which demonstrates an end-to-end data curation pipeline for curating Wikipedia data in Thai.
 - [`image-curation`](https://github.com/NVIDIA/NeMo-Curator/blob/main/tutorials/image-curation/image-curation.ipynb) which explores the scalable image curation modules.
 

diff --git a/config/sem_dedup_config.yaml b/config/sem_dedup_config.yaml
@@ -6,6 +6,7 @@ num_files: 16
 embeddings_save_loc: "embeddings"
 embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
 embedding_batch_size: 128
+write_embeddings_to_disk: true
 
 # Clustering configuration
 clustering_save_loc: "clustering_results"

diff --git a/docs/user-guide/image/gettingstarted.rst b/docs/user-guide/image/gettingstarted.rst
@@ -12,7 +12,7 @@ Install NeMo Curator
 ---------------------
 To install the image curation modules of NeMo Curator, ensure you meet the following requirements:
 
-* Python 3.10
+* Python 3.10 or higher
 * Ubuntu 22.04/20.04
 * NVIDIA GPU
   * Volta™ or higher (compute capability 7.0+)

diff --git a/docs/user-guide/semdedup.rst b/docs/user-guide/semdedup.rst
@@ -45,6 +45,7 @@ Semantic deduplication in NeMo Curator can be configured using a YAML file. Here
     embeddings_save_loc: "embeddings"
     embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
     embedding_batch_size: 128
+    write_embeddings_to_disk: true
 
     # Clustering configuration
     clustering_save_loc: "clustering_results"

diff --git a/examples/fuzzy_deduplication.py b/examples/fuzzy_deduplication.py
@@ -69,14 +69,12 @@ def main(args):
             id_field=dataset_id_field,
             text_field=dataset_text_field,
             seed=42,
-            char_ngrams=5,
+            char_ngrams=24,
             num_buckets=20,
             hashes_per_bucket=13,
             use_64_bit_hash=False,
-            buckets_per_shuffle=5,
-            false_positive_check=True,
-            num_anchors=2,
-            jaccard_threshold=0.8,
+            buckets_per_shuffle=5,  # set to a smaller value if encountering OOMs during LSH
+            false_positive_check=False,
         )
         fuzzy_dup = FuzzyDuplicates(logger=log_dir, config=fuzzy_dedup_config)
         duplicates = fuzzy_dup(dataset=input_dataset)

diff --git a/nemo_curator/modules/config.py b/nemo_curator/modules/config.py
@@ -67,47 +67,69 @@ class FuzzyDuplicatesConfig(BaseConfig):
 
     # Minhash + LSH Config
     seed: int = 42
-    char_ngrams: int = 5
+    char_ngrams: int = 24
     num_buckets: int = 20
     hashes_per_bucket: int = 13
     use_64_bit_hash: bool = False
     buckets_per_shuffle: int = 1
 
-    false_positive_check: bool = True
-    # Only required for fp check
-    num_anchors: int = 2
-    jaccard_threshold: float = 0.8
-    bucket_mapping_blocksize: int = 256
-    parts_per_worker: int = 1
-    bucket_parts_per_worker: int = 8
+    false_positive_check: bool = False
+    # Only required for false positive check
+    num_anchors: Optional[int] = None
+    jaccard_threshold: Optional[float] = None
+    bucket_mapping_blocksize: Optional[int] = None
+    parts_per_worker: Optional[int] = None
+    bucket_parts_per_worker: Optional[int] = None
 
     def __post_init__(self):
         self.num_hashes = self.num_buckets * self.hashes_per_bucket
-        if self.cache_dir is None:
-            raise ValueError(
-                "Finding fuzzy duplicates requires a cache directory accessible via all workers to store intermediates"
-            )
+        false_positive_defaults = {
+            "num_anchors": 2,
+            "jaccard_threshold": 0.8,
+            "bucket_mapping_blocksize": 256,
+            "parts_per_worker": 1,
+            "bucket_parts_per_worker": 8,
+        }
         if self.false_positive_check:
             warnings.warn(
                 "Identifying false positives during the Minhash deduplication is computationally expensive."
                 " For improved performance consider setting this to False"
             )
-        if not self.false_positive_check and self.char_ngrams < 20:
-            warnings.warn(
-                "Using a small char_ngrams value might lead to a large number (~5%) of false positives during deduplication."
-                " Using a value of at least 20 for char_ngrams is recommended."
-            )
-        if self.num_anchors <= 0:
-            raise ValueError("Number of anchors must be greater than 0")
-        if self.num_anchors > 2:
-            warnings.warn(
-                "Using a higher number of anchor docs might lead to higher memory footprint and might impact performance",
-                category=UserWarning,
+            for arg, default in false_positive_defaults.items():
+                if getattr(self, arg) is None:
+                    setattr(self, arg, default)
+            if self.num_anchors <= 0:
+                raise ValueError("Number of anchors must be greater than 0")
+            if self.num_anchors > 2:
+                warnings.warn(
+                    "Using a higher number of anchor docs might lead to higher memory footprint and might impact performance",
+                    category=UserWarning,
+                )
+            if not 0 <= self.jaccard_threshold <= 1:
+                raise ValueError("Jaccard Threshold must be between [0,1]")
+        else:
+            if self.char_ngrams < 20:
+                warnings.warn(
+                    "Using a small char_ngrams value might lead to a large number (~5%) of false positives during deduplication."
+                    " Using a value of at least 20 for char_ngrams is recommended."
+                )
+            unused_false_positive_args = [
+                arg
+                for arg in false_positive_defaults.keys()
+                if getattr(self, arg) is not None
+            ]
+            if unused_false_positive_args:
+                warnings.warn(
+                    f"False positive check is disabled. Unused arguments {unused_false_positive_args} will be ignored",
+                    category=UserWarning,
+                )
+
+        if self.cache_dir is None:
+            raise ValueError(
+                "Finding fuzzy duplicates requires a cache directory accessible via all workers to store intermediates"
             )
-        if not 0 <= self.jaccard_threshold <= 1:
-            raise ValueError("Jaccard Threshold must be between [0,1]")
-        if self.buckets_per_shuffle <= 0:
-            raise ValueError("Buckets per shuffle must be greater than 0")
+        if not 1 <= self.buckets_per_shuffle <= self.num_buckets:
+            raise ValueError("Buckets per shuffle must be between [1, num_buckets]")
 
 
 @dataclass
@@ -123,7 +145,10 @@ class SemDedupConfig(BaseConfig):
         embeddings_save_loc (str): Location to save embeddings.
         embedding_model_name_or_path (str): Model name or path for embeddings.
         embedding_batch_size (int): Initial batch size for processing embeddings.
-        embedding_pooling_strategy (str): Strategy for pooling embeddings, either "mean" or "last_token". Defaults to "last_token".
+        embedding_pooling_strategy (str): Strategy for pooling embeddings, either "mean_pooling" or "last_token". Defaults to "last_token".
+        write_embeddings_to_disk (bool): If True, saves the embeddings to disk. Defaults to True.
+            We recommend setting this to False when you have a delayed pipeline.
+            Setting it to False can lead to more memory overhead.
         clustering_save_loc (str): Location to save clustering results.
         n_clusters (int): Number of clusters.
         seed (int): Seed for clustering.
@@ -146,6 +171,7 @@ class SemDedupConfig(BaseConfig):
     embedding_batch_size: int = 128
     # Options: "mean_pooling", "last_token"
     embedding_pooling_strategy: str = "mean_pooling"
+    write_embeddings_to_disk: bool = True
 
     # Clustering config
     clustering_save_loc: str = "clustering_results"

diff --git a/nemo_curator/modules/fuzzy_dedup/minhash.py b/nemo_curator/modules/fuzzy_dedup/minhash.py
@@ -39,7 +39,7 @@ def __init__(
         self,
         seed: int = 42,
         num_hashes: int = 260,
-        char_ngrams: int = 5,
+        char_ngrams: int = 24,
         use_64bit_hash: bool = False,
         logger: Union[logging.LoggerAdapter, str] = "./",
         id_field: str = "id",

diff --git a/nemo_curator/modules/semantic_dedup/embeddings.py b/nemo_curator/modules/semantic_dedup/embeddings.py
@@ -241,6 +241,7 @@ def __call__(self, dataset: DocumentDataset) -> DocumentDataset:
                 )
             )
         else:
+            embedding_ddf = self.create_embeddings(dataset.df, self.input_column)
             ddf = DocumentDataset(embedding_ddf)
 
         self.logger.info(

diff --git a/nemo_curator/modules/semantic_dedup/semdedup.py b/nemo_curator/modules/semantic_dedup/semdedup.py
@@ -51,6 +51,7 @@ def __init__(
             embedding_pooling_strategy=config.embedding_pooling_strategy,
             input_column=input_column,
             embedding_output_dir=os.path.join(cache_dir, config.embeddings_save_loc),
+            write_embeddings_to_disk=config.write_embeddings_to_disk,
             logger=logger,
             profile_dir=self.config.profile_dir,
         )

diff --git a/nemo_curator/package_info.py b/nemo_curator/package_info.py
@@ -14,10 +14,10 @@
 
 
 MAJOR = 0
-MINOR = 6
+MINOR = 7
 PATCH = 0
-PRE_RELEASE = "rc0"
-DEV = "dev1"
+PRE_RELEASE = "rc1"
+DEV = "dev0"
 
 # Use the following formatting: (major, minor, patch, pre-release)
 VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE, DEV)

diff --git a/nemo_curator/sample_dataframe.py b/nemo_curator/sample_dataframe.py
-Original file line number
+Diff line change
@@ Expand Up @@
                     )
                 )
             else:
+                embedding_ddf = self.create_embeddings(dataset.df, self.input_column)
                 ddf = DocumentDataset(embedding_ddf)
             self.logger.info(
@@ Expand Down @@