sarahyurick
diff --git a/‎docs/user-guide/index.rst
Lines changed: 3 additions & 0 deletions b/‎docs/user-guide/index.rst
Lines changed: 3 additions & 0 deletions
diff --git a/‎docs/user-guide/syntheticdata.rst
Lines changed: 18 additions & 0 deletions b/‎docs/user-guide/syntheticdata.rst
Lines changed: 18 additions & 0 deletions
diff --git a/‎nemo_curator/__init__.py
Lines changed: 7 additions & 0 deletions b/‎nemo_curator/__init__.py
Lines changed: 7 additions & 0 deletions
diff --git a/‎nemo_curator/datasets/doc_dataset.py
Lines changed: 39 additions & 1 deletion b/‎nemo_curator/datasets/doc_dataset.py
Lines changed: 39 additions & 1 deletion
diff --git a/‎nemo_curator/services/__init__.py
Lines changed: 26 additions & 0 deletions b/‎nemo_curator/services/__init__.py
Lines changed: 26 additions & 0 deletions
diff --git a/‎nemo_curator/services/conversation_formatter.py
Lines changed: 28 additions & 0 deletions b/‎nemo_curator/services/conversation_formatter.py
Lines changed: 28 additions & 0 deletions
diff --git a/‎nemo_curator/services/model_client.py
Lines changed: 93 additions & 0 deletions b/‎nemo_curator/services/model_client.py
Lines changed: 93 additions & 0 deletions
diff --git a/‎nemo_curator/services/nemo_client.py
Lines changed: 100 additions & 0 deletions b/‎nemo_curator/services/nemo_client.py
Lines changed: 100 additions & 0 deletions
@@ -18,6 +18,9 @@
 :ref:`GPU Accelerated Exact and Fuzzy Deduplication <data-curator-gpu-deduplication>`
    Both exact and fuzzy deduplication functionalities are supported in NeMo Curator and accelerated using RAPIDS cuDF.
 
+:ref:`Synthetic Data Generation <data-curator-syntheticdata>`
+   Synthetic data generation tools and example pipelines are available within NeMo Curator.
+
 :ref:`Downstream Task Decontamination <data-curator-downstream>`
    After training, large language models are usually evaluated by their performance on downstream tasks consisting of unseen test data. When dealing with large datasets, there is a potential for leakage of this test data into the model’s training dataset. NeMo Curator allows you to remove sections of documents in your dataset that are present in downstream tasks.
 
 
@@ -0,0 +1,18 @@
+
+.. _data-curator-syntheticdata:
+
+======================================
+Synthetic Data Generation
+======================================
+--------------------------------------
+Background
+--------------------------------------
+Synthetic data generation has become increasingly useful in large language model training.
+It is used in pretraining, fine-tuning, and evaluation.
+Synthetically generated data can be useful for adapting an LLM to low-resource languages/domains, or for performing knowledge distillation from other models, among other purposes.
+There are a variety of ways to construct synthetic data generation pipelines, with numerous LLM and classical filters.
+
+NeMo Curator has a simple, easy-to-use set of tools that allow you to use prebuilt synthetic generation pipelines or build your own.
+Any model inference service that uses the OpenAI API is compatible with the synthetic data generation module, allowing you to generate your data from any model.
+NeMo Curator has prebuilt synthetic data generation pipelines for supervised fine-tuning (SFT) and preference data that were used to generate data for the training of `Nemotron-4 340B <https://research.nvidia.com/publication/2024-06_nemotron-4-340b>`_.
+You can also easily interweave filtering and deduplication steps in your synthetic data pipeline with the other modules in NeMo Curator.
@@ -34,6 +34,13 @@
 
 
 from .modules import *
+from .services import (
+    AsyncLLMClient,
+    AsyncOpenAIClient,
+    LLMClient,
+    NemoDeployClient,
+    OpenAIClient,
+)
 from .utils.distributed_utils import get_client
 
 # Dask will automatically convert the list score type
 
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import List, Union
+from typing import List, Optional, Union
 
 import dask.dataframe as dd
 
@@ -130,6 +130,44 @@ def to_pickle(
     ):
         raise NotImplementedError("DocumentDataset does not support to_pickle yet")
 
+    @classmethod
+    def from_pandas(
+        cls,
+        data,
+        npartitions: Optional[int] = 1,
+        chunksize: Optional[int] = None,
+        sort: Optional[bool] = True,
+        name: Optional[str] = None,
+    ):
+        """
+        Builds a document dataset out of an in-memory pandas data frame.
+
+        Every argument is handed to Dask's from_pandas unchanged; see
+        https://docs.dask.org/en/stable/generated/dask.dataframe.from_pandas.html
+        for their exact meaning.
+
+        Args:
+            data: A pandas dataframe
+            npartitions: Forwarded to Dask as ``npartitions``. Defaults to 1.
+            chunksize: Forwarded to Dask as ``chunksize``. Defaults to None.
+            sort: Forwarded to Dask as ``sort``. Defaults to True.
+            name: Forwarded to Dask as ``name``. Defaults to None.
+        Returns:
+            A document dataset with a pandas backend (on the CPU).
+        """
+        # NOTE(review): Dask appears to reject npartitions and chunksize being set
+        # together, so callers passing chunksize likely need npartitions=None -- confirm.
+        dask_frame = dd.from_pandas(
+            data=data,
+            npartitions=npartitions,
+            chunksize=chunksize,
+            sort=sort,
+            name=name,
+        )
+        return cls(dask_frame)
+
+    def to_pandas(self):
+        """
+        Materializes this DocumentDataset as a pandas dataframe.
+
+        The underlying dataframe is first switched to the pandas backend
+        and then computed.
+
+        Returns:
+            A pandas dataframe (on the CPU)
+        """
+        pandas_backed = self.df.to_backend("pandas")
+        return pandas_backed.compute()
+
 
 def _read_json_or_parquet(
     input_files: Union[str, List[str]],
 
@@ -0,0 +1,26 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .conversation_formatter import ConversationFormatter
+from .model_client import AsyncLLMClient, LLMClient
+from .nemo_client import NemoDeployClient
+from .openai_client import AsyncOpenAIClient, OpenAIClient
+
+# Public names of the services package, grouped by the module they come from.
+__all__ = [
+    # .model_client
+    "LLMClient",
+    "AsyncLLMClient",
+    # .openai_client
+    "OpenAIClient",
+    "AsyncOpenAIClient",
+    # .nemo_client
+    "NemoDeployClient",
+    # .conversation_formatter
+    "ConversationFormatter",
+]
@@ -0,0 +1,28 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from abc import ABC, abstractmethod
+from typing import List
+
+
+class ConversationFormatter(ABC):
+    """
+    Represents a way of formatting a conversation with an LLM
+    such that it can respond appropriately
+    """
+
+    @abstractmethod
+    def format_conversation(self, conv: List[dict]) -> str:
+        """
+        Turns a conversation into a single formatted string.
+
+        Args:
+            conv: The conversation to format, given as a list of dicts.
+                NOTE(review): presumably each dict carries "role" and "content"
+                keys -- confirm against the concrete formatters.
+        Returns:
+            The formatted conversation.
+        Raises:
+            NotImplementedError: Always, in this base implementation;
+                subclasses must override the method.
+        """
+        raise NotImplementedError(
+            "format_conversation must be implemented by subclasses"
+        )
@@ -0,0 +1,93 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from abc import ABC, abstractmethod
+from typing import Iterable, List, Optional, Union
+
+from nemo_curator.services.conversation_formatter import ConversationFormatter
+
+
+class LLMClient(ABC):
+    """
+    Abstract base class for clients that connect to an LLM inference server
+    and issue their requests synchronously
+    """
+
+    @abstractmethod
+    def query_model(
+        self,
+        *,
+        messages: Iterable,
+        model: str,
+        conversation_formatter: Optional[ConversationFormatter] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = 1,
+        seed: Optional[int] = None,
+        stop: Union[Optional[str], List[str]] = None,
+        stream: bool = False,
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+    ) -> List[str]:
+        """
+        Queries a model with a conversation.
+
+        All arguments are keyword-only. Only ``messages`` and ``model`` are
+        required; how the remaining options are interpreted is up to the
+        concrete client.
+
+        Returns:
+            A list of strings.
+        Raises:
+            NotImplementedError: Always, in this base implementation.
+        """
+        error_message = "Subclass of LLMClient must implement 'query_model'"
+        raise NotImplementedError(error_message)
+
+    @abstractmethod
+    def query_reward_model(
+        self,
+        *,
+        messages: Iterable,
+        model: str,
+        conversation_formatter: Optional[ConversationFormatter] = None,
+    ) -> dict:
+        """
+        Queries a reward model with a conversation.
+
+        All arguments are keyword-only; ``conversation_formatter`` is optional.
+
+        Returns:
+            A dict.
+        Raises:
+            NotImplementedError: Always, in this base implementation.
+        """
+        error_message = "Subclass of LLMClient must implement 'query_reward_model'"
+        raise NotImplementedError(error_message)
+
+
+class AsyncLLMClient(ABC):
+    """
+    Interface representing a client connecting to an LLM inference server
+    and making requests asynchronously
+    """
+
+    @abstractmethod
+    async def query_model(
+        self,
+        *,
+        messages: Iterable,
+        model: str,
+        conversation_formatter: Optional[ConversationFormatter] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = 1,
+        seed: Optional[int] = None,
+        stop: Union[Optional[str], List[str]] = None,
+        stream: bool = False,
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+    ) -> List[str]:
+        """
+        Queries a model with a conversation (coroutine).
+
+        All arguments are keyword-only. Only ``messages`` and ``model`` are
+        required; how the remaining options are interpreted is up to the
+        concrete client.
+
+        Returns:
+            A list of strings.
+        Raises:
+            NotImplementedError: Always, in this base implementation.
+        """
+        raise NotImplementedError(
+            "Subclass of AsyncLLMClient must implement 'query_model'"
+        )
+
+    @abstractmethod
+    async def query_reward_model(
+        self,
+        *,
+        messages: Iterable,
+        model: str,
+        conversation_formatter: Optional[ConversationFormatter] = None,
+    ) -> dict:
+        """
+        Queries a reward model with a conversation (coroutine).
+
+        All arguments are keyword-only; ``conversation_formatter`` is optional.
+
+        Returns:
+            A dict.
+        Raises:
+            NotImplementedError: Always, in this base implementation.
+        """
+        # The message names this class; it previously named LLMClient.
+        raise NotImplementedError(
+            "Subclass of AsyncLLMClient must implement 'query_reward_model'"
+        )
@@ -0,0 +1,100 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+from typing import Iterable, List, Optional, Union
+
+from nemo_curator.services.conversation_formatter import ConversationFormatter
+from nemo_curator.utils.import_utils import safe_import_from
+
+from .model_client import AsyncLLMClient, LLMClient
+
+# NemoQueryLLM is looked up in nemo.deploy.nlp through safe_import_from rather than a
+# plain import (presumably so this module still imports without NeMo installed -- TODO confirm).
+NemoQueryLLM = safe_import_from("nemo.deploy.nlp", "NemoQueryLLM")
+
+
+class NemoDeployClient(LLMClient):
+    """
+    A wrapper around NemoQueryLLM for querying models in synthetic data generation
+    """
+
+    def __init__(self, nemo_deploy: NemoQueryLLM) -> None:
+        """
+        Args:
+            nemo_deploy: The NemoQueryLLM instance that every query is sent through.
+        """
+        self.client = nemo_deploy
+
+    def query_model(
+        self,
+        *,
+        messages: Iterable,
+        model: str,
+        conversation_formatter: Optional[ConversationFormatter] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        seed: Optional[int] = None,
+        stop: Union[Optional[str], List[str]] = None,
+        stream: bool = False,
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+    ) -> List[str]:
+        """
+        Formats a conversation into a prompt and queries the wrapped NemoQueryLLM with it.
+
+        Args:
+            messages: The conversation to send. It is passed to conversation_formatter.
+            model: The name of the model to query. It is written to the wrapped
+                client's ``model_name`` attribute before the query is made.
+            conversation_formatter: Required. Converts messages into the prompt string.
+            max_tokens: Forwarded to query_llm as ``max_output_len``.
+            n: Not supported. A warning is issued if it is not None.
+            seed: Forwarded to query_llm as ``random_seed``.
+            stop: A stop word or a list of stop words, forwarded to query_llm as
+                ``stop_words_list``. A stop word found at the end of a response
+                is removed from it.
+            stream: Not supported. A warning is issued if it is True.
+            temperature: Forwarded to query_llm as ``temperature``.
+            top_k: Forwarded to query_llm as ``top_k``.
+            top_p: Forwarded to query_llm as ``top_p``.
+        Returns:
+            The responses, with trailing stop words and surrounding whitespace removed.
+        Raises:
+            ValueError: If conversation_formatter is None.
+        """
+        if conversation_formatter is None:
+            raise ValueError(
+                "NemoDeployClient's query_model requires a conversation_formatter"
+            )
+
+        prompt = conversation_formatter.format_conversation(messages)
+        self.client.model_name = model
+
+        if n is not None:
+            warnings.warn("n is not supported in NemoDeployClient")
+        if stream:
+            warnings.warn("streaming is not supported in NemoDeployClient")
+
+        if isinstance(stop, str):
+            stop = [stop]
+
+        # A single prompt is sent, so only the first entry of the result is used
+        # (presumably the list of generations for that prompt -- TODO confirm).
+        response = self.client.query_llm(
+            prompts=[prompt],
+            max_output_len=max_tokens,
+            random_seed=seed,
+            stop_words_list=stop,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+        )[0]
+
+        return self._postprocess_response(response, stop)
+
+    @staticmethod
+    def _postprocess_response(
+        responses: List[str], stop_words: Optional[List[str]]
+    ) -> List[str]:
+        """
+        Removes trailing stop words and surrounding whitespace from each response.
+
+        Args:
+            responses: The raw responses.
+            stop_words: The stop words to remove from the end of each response.
+                None is treated as "no stop words".
+        Returns:
+            The cleaned responses, in the same order.
+        """
+        # query_model passes stop=None by default, which must not be iterated.
+        # Empty strings are dropped too: every string ends with "" and
+        # response[:-0] would erase the whole response.
+        usable_stop_words = [stop for stop in (stop_words or []) if stop]
+
+        processed_responses = []
+        for response in responses:
+            for stop in usable_stop_words:
+                if response.endswith(stop):
+                    response = response[: -len(stop)]
+            processed_responses.append(response.strip())
+        return processed_responses
+
+    def query_reward_model(
+        self,
+        *,
+        messages: Iterable,
+        model: str,
+        conversation_formatter: Optional[ConversationFormatter] = None,
+    ) -> dict:
+        """
+        Prompts an LLM Reward model to score a conversation between a user and assistant
+        Args:
+            messages: The conversation to calculate a score for.
+                Should be formatted like:
+                    [{"role": "user", "content": "Write a sentence"}, {"role": "assistant", "content": "This is a sentence"}, ...]
+            model: The name of the model that should be used to calculate the reward.
+                Must be a reward model, cannot be a regular LLM.
+            conversation_formatter: Accepted so the signature matches the LLMClient
+                interface. It is not used.
+        Returns:
+            A mapping of score_name -> score
+        Raises:
+            NotImplementedError: Always. Reward models are not supported by this client.
+        """
+        raise NotImplementedError(
+            "Reward model inference is not supported in NeMo Deploy Clients"
+        )