From bbbf0bb2761d22503a6be9b1e6d25c966171ab4b Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Wed, 12 Mar 2025 11:55:02 -0700
Subject: [PATCH] add async llm modifier and llm modifier

Signed-off-by: Sarah Yurick
---
 examples/README.md                              |  32 +--
 examples/async_llm_pii_redaction.py             |  53 +++++
 examples/llm_pii_redaction.py                   |  52 ++++
 .../modifiers/async_llm_pii_modifier.py         | 214 +++++++++++++++++
 nemo_curator/modifiers/llm_pii_modifier.py      | 176 ++++++++++++++
 nemo_curator/utils/llm_pii_utils.py             | 222 ++++++++++++++++++
 6 files changed, 734 insertions(+), 15 deletions(-)
 create mode 100644 examples/async_llm_pii_redaction.py
 create mode 100644 examples/llm_pii_redaction.py
 create mode 100644 nemo_curator/modifiers/async_llm_pii_modifier.py
 create mode 100644 nemo_curator/modifiers/llm_pii_modifier.py
 create mode 100644 nemo_curator/utils/llm_pii_utils.py

diff --git a/examples/README.md b/examples/README.md
index 29545978c..6a1b9c623 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -4,21 +4,23 @@ This directory contains multiple Python scripts with examples of how to use vari
 The goal of these examples is to give the user an overview of many of the ways your text data can be curated. These include:
 
-| Python Script                         | Description                                                                                                   |
-|---------------------------------------|---------------------------------------------------------------------------------------------------------------|
-| blend_and_shuffle.py                  | Combine multiple datasets into one with different amounts of each dataset, then randomly permute the dataset. |
-| classifier_filtering.py               | Train a fastText classifier, then use it to filter high and low quality data.                                 |
-| download_arxiv.py                     | Download Arxiv tar files and extract them.                                                                    |
-| download_common_crawl.py              | Download Common Crawl WARC snapshots and extract them.                                                        |
-| download_wikipedia.py                 | Download the latest Wikipedia dumps and extract them.                                                         |
-| exact_deduplication.py                | Use the `ExactDuplicates` class to perform exact deduplication on text data.                                  |
-| find_pii_and_deidentify.py            | Use the `PiiModifier` and `Modify` classes to remove personally identifiable information from text data.      |
-| fuzzy_deduplication.py                | Use the `FuzzyDuplicatesConfig` and `FuzzyDuplicates` classes to perform fuzzy deduplication on text data.    |
-| identify_languages.py                 | Use `FastTextLangId` to filter data by language                                                               |
-| raw_download_common_crawl.py          | Download the raw compressed WARC files from Common Crawl without extracting them.                             |
-| semdedup_example.py                   | Use the `SemDedup` class to perform semantic deduplication on text data.                                      |
-| task_decontamination.py               | Remove segments of downstream evaluation tasks from a dataset.                                                |
-| translation_example.py                | Create and use an `IndicTranslation` model for language translation.                                          |
+| Python Script                         | Description                                                                                                      |
+|---------------------------------------|------------------------------------------------------------------------------------------------------------------|
+| async_llm_pii_redaction.py            | Use the `AsyncLLMPiiModifier` and `Modify` classes to remove personally identifiable information from text data. |
+| blend_and_shuffle.py                  | Combine multiple datasets into one with different amounts of each dataset, then randomly permute the dataset.    |
+| classifier_filtering.py               | Train a fastText classifier, then use it to filter high and low quality data.                                    |
+| download_arxiv.py                     | Download Arxiv tar files and extract them.                                                                       |
+| download_common_crawl.py              | Download Common Crawl WARC snapshots and extract them.                                                           |
+| download_wikipedia.py                 | Download the latest Wikipedia dumps and extract them.                                                            |
+| exact_deduplication.py                | Use the `ExactDuplicates` class to perform exact deduplication on text data.                                     |
+| find_pii_and_deidentify.py            | Use the `PiiModifier` and `Modify` classes to remove personally identifiable information from text data.         |
+| fuzzy_deduplication.py                | Use the `FuzzyDuplicatesConfig` and `FuzzyDuplicates` classes to perform fuzzy deduplication on text data.       |
+| identify_languages.py                 | Use `FastTextLangId` to filter data by language                                                                  |
+| llm_pii_redaction.py                  | Use the `LLMPiiModifier` and `Modify` classes to remove personally identifiable information from text data.      |
+| raw_download_common_crawl.py          | Download the raw compressed WARC files from Common Crawl without extracting them.                                |
+| semdedup_example.py                   | Use the `SemDedup` class to perform semantic deduplication on text data.                                         |
+| task_decontamination.py               | Remove segments of downstream evaluation tasks from a dataset.                                                   |
+| translation_example.py                | Create and use an `IndicTranslation` model for language translation.                                             |
 
 Before running any of these scripts, we strongly recommend displaying `python