From 7d2b709e7631db87d9b5630f0a6f8325f5d7e3db Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Fri, 10 Jan 2025 13:04:16 -0800 Subject: [PATCH 1/2] Add nemo-toolkit dependency Signed-off-by: Sarah Yurick --- pyproject.toml | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 329c2eec4..b275a189c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,7 @@ dependencies = [ "distributed>=2021.7.1", "fasttext==0.9.3", "ftfy==6.1.1", + "huggingface-hub", "in-place==0.5.0", "jieba==0.42.1", "justext==3.0.1", @@ -67,6 +68,7 @@ dependencies = [ "spacy>=3.6.0, <3.8.0", # TODO: Remove this pin once newer version is released "transformers==4.46.3", + "tqdm", "unidic-lite==1.0.8", "usaddress==0.5.10", "warcio==1.7.4", @@ -107,20 +109,14 @@ image_nightly = [ "timm>=1.0.8", "nemo_curator[cuda12x_nightly]", ] -# Installs bitext curation modules -bitext = [ - "huggingface-hub", - "tqdm", - "transformers", - "nemo_curator[cuda12x]", -] # Installs all of the above with Stable RAPIDS all = [ + "nemo-toolkit", "nemo_curator[image]", - "nemo_curator[bitext]", ] # Installs all of the above with RAPIDS Nightlies all_nightly = [ + "nemo-toolkit", "nemo_curator[image_nightly]", ] From 0e834fd018b61be3f9c7ebebfbe27a15b2f3edb1 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Fri, 10 Jan 2025 13:13:02 -0800 Subject: [PATCH 2/2] add nemo-toolkit install to Dockerfile Signed-off-by: Sarah Yurick --- Dockerfile | 2 +- pyproject.toml | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index b0d1bedc7..d81d72a5c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -42,7 +42,7 @@ RUN conda create -y --name curator -c nvidia/label/cuda-${CUDA_VER} -c conda-for libcusolver \ cuda-nvvm && \ source activate curator && \ - pip install --upgrade pytest pip + pip install --upgrade pytest pip nemo-toolkit RUN \ --mount=type=bind,source=/opt/NeMo-Curator/nemo_curator/__init__.py,target=/opt/NeMo-Curator/nemo_curator/__init__.py,from=curator-update \ diff --git a/pyproject.toml b/pyproject.toml index b275a189c..abb0d9e3b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,9 +66,9 @@ dependencies = [ "resiliparse", "sentencepiece", "spacy>=3.6.0, <3.8.0", + "tqdm", # TODO: Remove this pin once newer version is released "transformers==4.46.3", - "tqdm", "unidic-lite==1.0.8", "usaddress==0.5.10", "warcio==1.7.4", @@ -111,12 +111,10 @@ image_nightly = [ ] # Installs all of the above with Stable RAPIDS all = [ - "nemo-toolkit", "nemo_curator[image]", ] # Installs all of the above with RAPIDS Nightlies all_nightly = [ - "nemo-toolkit", "nemo_curator[image_nightly]", ]