Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hard negative mining for Retriever fine-tuning #523

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion nemo_curator/modules/semantic_dedup.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ def __call__(self, embeddings_dataset: DocumentDataset):
)

with performance_report_if_with_ts_suffix(self.profile_dir, "clustering-model"):
embeddings_df = embeddings_df[[self.id_col, self.embedding_col]]
# embeddings_df = embeddings_df[[self.id_col, self.embedding_col]]

embeddings_df = embeddings_df.to_backend("pandas").persist()
embeddings_df = embeddings_df.repartition(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,30 @@ class RetrieverEvalSDGConfig(BaseConfig):
Question:
{question}
"""


@dataclass
class RetrieverHardNegativeMiningConfig(BaseConfig):
    """
    Configuration for mining hard negatives for retriever fine-tuning.

    Attributes:
        model_name: Identifier of the embedding model used for mining.
        model_type: Embedding backend selector (e.g. "nvidia" or "hf" per the
            sample yaml config).
        base_url: Endpoint for the NVIDIA embedding API.
        api_key: API key for the embedding service; overridable via CLI.
        truncate: Truncation side passed to the embedding service.
        hard_negatives_to_mine: Number of hard negatives to mine per query.
        hard_neg_mining_algorithm: Mining strategy name ("topk_abs" or
            "topk_percpos" per the sample yaml config).
        max_hardness_threshold: Upper similarity bound used by "topk_abs".
        min_hardness_threshold: Lower similarity bound used by "topk_abs".
        query_prefix: Prefix prepended to queries before embedding.
        passage_prefix: Prefix prepended to passages before embedding.
        percpos: Fraction-of-positive-score threshold used by "topk_percpos".
        min_cluster_size: Minimum documents per semantic cluster.
        max_number_clusters: Maximum number of semantic clusters.
        cluster_output_dir: Directory where clustering results are written.
        logger_output_dir: Directory where logs are written.
    """

    model_name: str = "nvdev/nvidia/llama-3.2-nv-embedqa-1b-v2"
    model_type: str = "nvidia"
    base_url: str = "https://integrate.api.nvidia.com/v1"
    # NOTE(review): default is None, so the annotation should be
    # Optional[str] — confirm and fix once the import block is in view.
    api_key: str = None
    truncate: str = "END"
    hard_negatives_to_mine: int = 4
    hard_neg_mining_algorithm: str = "topk_abs"
    max_hardness_threshold: float = 0.75
    # presumably a similarity score in [0, 1]; 0 disables the lower bound — TODO confirm
    min_hardness_threshold: float = 0
    query_prefix: str = ""
    passage_prefix: str = ""
    percpos: float = 0.95
    min_cluster_size: int = 50
    max_number_clusters: int = 200
    cluster_output_dir: str = "/tmp/clusters"
    logger_output_dir: str = "/tmp/logs"
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# HARD-NEGATIVE MINING parameters
# base_url: "https://integrate.api.nvidia.com/v1"
model_name: "intfloat/e5-large-unsupervised"
# "intfloat/e5-large-unsupervised"
# "nvdev/nvidia/llama-3.2-nv-embedqa-1b-v1"
# nvdev/nvidia/nv-embedqa-e5-v5"
# "nvdev/nvidia/llama-3.2-nv-embedqa-1b-v1"
model_type: "hf"
query_prefix: "query:"
passage_prefix: "passage:"
api_key: "your api key here"
truncate: "END"
hard_negatives_to_mine: 4
hard_neg_mining_algorithm: "topk_percpos"
percpos: 0.95
# max_hardness_threshold: 0.75
# min_hardness_threshold: 0

# SEMANTIC CLUSTERING parameters
min_cluster_size: 50
max_number_clusters: 200
cluster_output_dir: "/tmp/clusters"
logger_output_dir: "/tmp/logs"
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import importlib
import os
import shutil
from typing import Any, List
import pdb
from tqdm.dask import TqdmCallback
import glob
import time

from nemo_curator.datasets import DocumentDataset
from nemo_curator.utils.file_utils import get_all_files_paths_under
from retriever_hardnegative_miner import HardNegativeMiner
from config.config import RetrieverHardNegativeMiningConfig
from nemo_curator.utils.distributed_utils import get_client


def main():
    """Mine hard negatives for retriever fine-tuning.

    Reads annotated examples in jsonl format from ``--input-dir``, mines hard
    negatives using the settings in the yaml file given by
    ``--hard-negative-mining-config``, and writes the mined dataset in jsonl
    format under ``--output-dir``.

    Raises:
        ValueError: If the input directory is missing, the output directory
            already exists, or a required argument was not provided.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input-dir",
        type=str,
        default="",
        help="Input dir path containing annotated data files in jsonl format",
    )
    parser.add_argument(
        "--hard-negative-mining-config",
        type=str,
        default="",
        help="Configuration yaml file path containing config for hard negative mining",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="",
        help="Output file containing hard negatives",
    )
    parser.add_argument(
        "--api-key",
        type=str,
        default=None,
        help="The API key to use for the synthetic data generation LLM client.",
    )
    parser.add_argument(
        "--api-timeout",
        type=int,
        default=120,
        help="The timeout value for API calls in seconds.",
    )
    args = parser.parse_args()

    # Validate paths up front so we fail before any expensive work.
    if not os.path.exists(args.input_dir):
        raise ValueError("Input directory not found")
    if os.path.exists(args.output_dir):
        raise ValueError("Output dir exists already, use a new file name!")

    if args.input_dir:
        input_dataset = DocumentDataset.read_json(args.input_dir)
    else:
        raise ValueError("provide input file path")

    if args.hard_negative_mining_config:
        cfg = RetrieverHardNegativeMiningConfig.from_yaml(
            args.hard_negative_mining_config
        )
    else:
        raise ValueError("provide config for hard negative mining")

    # A key passed on the command line overrides the one in the yaml config.
    if args.api_key:
        cfg.api_key = args.api_key

    mine_hard_negatives = HardNegativeMiner(cfg)
    print("Mining hard negatives ...")
    st_time = time.time()
    mined_dataset = mine_hard_negatives(input_dataset)

    print(f"Time taken = {time.time() - st_time:.2f}")
    print("Saving data in jsonl format ...")
    mined_dataset.df.to_json(
        os.path.join(args.output_dir, "mined_dataset"),
        lines=True,
        orient="records",
    )


if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
"{\"_id\": \"doc3\", \"text\": \"The Eiffel Tower is an iron lattice tower on the Champ de Mars in Paris.\", \"title\": \"Iconic Landmark\" }\n",
"...\n",
"```\n",
"This repository contains a sample JSONL file `data/sample_data.jsonl`."
"This repository contains a sample JSONL file `sample_data/sample_data.jsonl`."
]
},
{
Expand Down Expand Up @@ -110,7 +110,7 @@
"outputs": [],
"source": [
"import pandas as pd\n",
"df = pd.read_json(\"../data/sample_data_rawdoc.jsonl\", lines=True)"
"df = pd.read_json(\"../sample_data/sample_data_rawdoc.jsonl\", lines=True)"
]
},
{
Expand Down Expand Up @@ -850,7 +850,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
"version": "3.10.14"
}
},
"nbformat": 4,
Expand Down
98 changes: 98 additions & 0 deletions tutorials/nemo-retriever-synthetic-data-generation/repartition.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import importlib
import os
import shutil
from typing import Any, List
import pdb
from tqdm.dask import TqdmCallback
import glob
import time

from nemo_curator.datasets import DocumentDataset
from nemo_curator.utils.file_utils import get_all_files_paths_under
from retriever_hardnegative_miner import HardNegativeMiner
from config.config import RetrieverHardNegativeMiningConfig
from nemo_curator.utils.distributed_utils import get_client


def main():
    """Repartition an annotated dataset by semantic similarity.

    Reads jsonl files from ``--input-dir``, clusters them with
    ``HardNegativeMiner.repartition_semantic_similarity`` using the settings
    in the yaml file given by ``--hard-negative-mining-config``, and writes
    the clustered dataset under ``--output-dir``.

    Raises:
        ValueError: If the input directory is missing, the output directory
            already exists, or a required argument was not provided.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input-dir",
        type=str,
        default="",
        help="Input dir path containing annotated data files in jsonl format",
    )
    parser.add_argument(
        "--hard-negative-mining-config",
        type=str,
        default="",
        help="Configuration yaml file path containing config for hard negative mining",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="",
        help="Output file containing clustered dataset",
    )
    parser.add_argument(
        "--api-key",
        type=str,
        default=None,
        help="The API key to use for the synthetic data generation LLM client.",
    )
    parser.add_argument(
        "--api-timeout",
        type=int,
        default=120,
        help="The timeout value for API calls in seconds.",
    )
    args = parser.parse_args()

    # Validate paths up front so we fail before any expensive work.
    if not os.path.exists(args.input_dir):
        raise ValueError("Input directory not found")
    if os.path.exists(args.output_dir):
        raise ValueError("Output dir exists already, use a new file name!")

    if args.input_dir:
        input_files = get_all_files_paths_under(
            args.input_dir, keep_extensions="jsonl"
        )
        input_dataset = DocumentDataset.read_json(input_files)
    else:
        raise ValueError("provide input file path")

    if args.hard_negative_mining_config:
        cfg = RetrieverHardNegativeMiningConfig.from_yaml(
            args.hard_negative_mining_config
        )
    else:
        raise ValueError("provide config for hard negative mining")

    # A key passed on the command line overrides the one in the yaml config.
    if args.api_key:
        cfg.api_key = args.api_key

    # Kept for its side effect: starts the GPU-backed Dask cluster that the
    # miner's dataframe operations run on.
    dask_client = get_client(cluster_type="gpu")

    miner = HardNegativeMiner(cfg)
    clustered_dataset = miner.repartition_semantic_similarity(input_dataset)

    # Save the clustered dataset and report elapsed time.
    st_time = time.time()
    clustered_dataset.df.to_json(os.path.join(args.output_dir, "clustered_dataset"))
    print(f"Time taken to cluster data = {time.time() - st_time:.2f} s")


if __name__ == "__main__":
    main()
Loading