import os
import torch
import numpy as np
from typing import List, Dict, Optional, Union
from tqdm import tqdm
from openai import OpenAI
from tenacity import retry, stop_after_attempt, wait_exponential
from FlagEmbedding.abc.inference import AbsEmbedder
from transformers.configuration_utils import PretrainedConfig


class NvidiaEmbedderConfig(PretrainedConfig):
    """Minimal config object so code that inspects `model.config` keeps working."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._name_or_path = "nvidia"


class NvidiaMockModel:
    """Placeholder model; actual inference is delegated to the embedding API client."""

    def __init__(self):
        self.config = NvidiaEmbedderConfig()


class NvidiaEmbedder(AbsEmbedder):
    """Embedder that delegates encoding to a local OpenAI-compatible endpoint
    (nvidia/llama-3.2-nv-embedqa-1b-v2 served at http://localhost:8000/v1)
    instead of running a model in-process."""

    def __init__(
        self,
        model_name_or_path: str,
        normalize_embeddings: bool = True,
        use_fp16: bool = True,
        query_instruction_for_retrieval: Optional[str] = None,
        query_instruction_format: str = "{}{}",
        devices: Optional[Union[str, List[str]]] = None,
        batch_size: int = 2048,
        query_max_length: int = 512,
        passage_max_length: int = 512,
        convert_to_numpy: bool = True,
        **kwargs
    ):
        super().__init__(
            model_name_or_path,
            normalize_embeddings=normalize_embeddings,
            use_fp16=use_fp16,
            query_instruction_for_retrieval=query_instruction_for_retrieval,
            query_instruction_format=query_instruction_format,
            devices=devices,
            batch_size=batch_size,
            query_max_length=query_max_length,
            passage_max_length=passage_max_length,
            convert_to_numpy=convert_to_numpy,
            **kwargs
        )

        self.model = NvidiaMockModel()
        self.client = OpenAI(
            api_key="not-needed",  # API key not needed for local server
            base_url="http://localhost:8000/v1"
        )
        self.model_name = "nvidia/llama-3.2-nv-embedqa-1b-v2"

    @retry(
        stop=stop_after_attempt(5),
        wait=wait_exponential(multiplier=1, min=4, max=30),
        reraise=True
    )
    def _get_embeddings(self, texts: List[str], input_type: str = "query") -> np.ndarray:
        """Get embeddings for a batch of texts with automatic retries and truncation on token size errors."""
        def try_with_texts(current_texts: List[str], retry_count: int = 0) -> np.ndarray:
            try:
                response = self.client.embeddings.create(
                    input=current_texts,
                    model=self.model_name,
                    encoding_format="float",
                    extra_body={"input_type": input_type, "truncate": "END"}
                )
                return np.array([data.embedding for data in response.data])
            except Exception as e:
                error_str = str(e)
                print(f"Error in _get_embeddings: {error_str}")

                # If we hit token size limit and haven't retried too many times, truncate and retry
                if "token size" in error_str.lower() and retry_count < 3:
                    truncated_texts = [t[:len(t) // 2] for t in current_texts]
                    print(f"Retrying with truncated texts (retry {retry_count + 1})")
                    return try_with_texts(truncated_texts, retry_count + 1)

                raise

        return try_with_texts(texts)

    def encode_queries(self, queries: List[str], batch_size: int = 128, **kwargs) -> np.ndarray:
        """Encode queries with input_type='query'."""
        all_embeddings = []
        for i in tqdm(range(0, len(queries), batch_size), desc="Encoding queries"):
            batch = queries[i:i + batch_size]
            embeddings = self._get_embeddings(batch, input_type="query")
            all_embeddings.append(embeddings)
        return np.vstack(all_embeddings)

    @staticmethod
    def _process_batch_static(args):
        """Static method to process a batch of passages."""
        index, batch, model_name, base_url = args
        try:
            # Create a new client for each process
            client = OpenAI(
                api_key="not-needed",
                base_url=base_url
            )

            # Get embeddings
            response = client.embeddings.create(
                input=batch,
                model=model_name,
                encoding_format="float",
                extra_body={"input_type": "passage", "truncate": "END"}
            )
            embeddings = np.array([data.embedding for data in response.data])
            return index, embeddings
        except Exception as e:
            print(f"Error processing batch {index}: {str(e)}")
            return index, None

    def encode_corpus(self, corpus: Union[List[Dict[str, str]], List[str]], batch_size: int = 128, num_processes: int = 16, **kwargs) -> np.ndarray:
        """Encode corpus passages with input_type='passage' using multiple processes."""
        if isinstance(corpus[0], dict):
            passages = [f"{doc.get('title', '')} {doc['text']}".strip() for doc in corpus]
        else:
            passages = corpus

        # Prepare batches with their indices and the parameters each worker needs
        batches = []
        for i in range(0, len(passages), batch_size):
            batch = passages[i:i + batch_size]
            # Include the model name and base URL (as a plain string so it pickles cleanly)
            batches.append((len(batches), batch, self.model_name, str(self.client.base_url)))

        # Process batches in parallel
        from multiprocessing import Pool
        with Pool(processes=num_processes) as pool:
            # Use tqdm to show progress
            results = list(tqdm(
                pool.imap(self._process_batch_static, batches),
                total=len(batches),
                desc=f"Encoding corpus with {num_processes} processes"
            ))

        # Sort results by index and collect embeddings
        sorted_results = sorted(results, key=lambda x: x[0])
        all_embeddings = []
        for _, embeddings in sorted_results:
            if embeddings is not None:
                all_embeddings.append(embeddings)
            else:
                raise Exception("One or more batches failed to process")

        return np.vstack(all_embeddings)

    @torch.no_grad()
    def encode_single_device(
        self,
        sentences: Union[List[str], str],
        batch_size: int = 128,
        max_length: int = 512,
        convert_to_numpy: bool = True,
        device: Optional[str] = None,
        **kwargs
    ):
        """Single-device encoding method; defaults to query-style encoding."""
        if isinstance(sentences, str):
            sentences = [sentences]
        return self.encode_queries(sentences, batch_size=batch_size)
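

# Example usage: a minimal sketch, not part of the embedder itself. It assumes an
# OpenAI-compatible embedding server is already running at http://localhost:8000/v1
# and serving nvidia/llama-3.2-nv-embedqa-1b-v2; the query and document below are
# illustrative placeholders.
if __name__ == "__main__":
    embedder = NvidiaEmbedder(model_name_or_path="nvidia/llama-3.2-nv-embedqa-1b-v2")

    query_emb = embedder.encode_queries(["what is retrieval augmented generation?"])
    corpus_emb = embedder.encode_corpus(
        [{"title": "RAG", "text": "Retrieval augmented generation combines search with LLMs."}],
        num_processes=1,  # single process is enough for a quick smoke test
    )

    # If the server returns normalized vectors, cosine similarity reduces to a dot product.
    print(query_emb @ corpus_emb.T)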